diff --git a/.github/workflows/contanerize.yaml b/.github/workflows/contanerize.yaml index ef3ba1b7..b4bad35f 100644 --- a/.github/workflows/contanerize.yaml +++ b/.github/workflows/contanerize.yaml @@ -9,8 +9,11 @@ on: branches: - main - dev + - dv_dev - dev_eco - # - dev_dagster142 + - v0_generated_code + - 133_dev_sitemaps + - 151-integrate-community-stats-codes tags: - "v*.*.*" @@ -154,7 +157,44 @@ jobs: type=ref,event=branch type=semver,pattern={{version}} type=sha - + build_code_workflows: + name: Dockerize Scheduler Workflows base + runs-on: ubuntu-latest + #strategy: + #matrix: + # project: [ "eco" ] + #project: [ "eco", "iow", "oih" ] + #platform: ["linux/amd64","linux/arm64"] + #platform: ["linux/amd64"] #linux/arm64 issues with building + steps: + - name: Set variables + run: | + REGISTRY_IMAGE=nsfearthcube/dagster-gleanerio-workflows + echo "REGISTRY_IMAGE=$REGISTRY_IMAGE" >> $GITHUB_ENV + working-directory: / + - name: Checkout Repo + uses: actions/checkout@v3 + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ env.REGISTRY_IMAGE }} + flavor: | + latest=true + tags: | + type=ref,event=tag + type=ref,event=branch + type=semver,pattern={{version}} + type=sha # - name: Set up Python 3.10 # uses: actions/setup-python@v4 # with: @@ -201,7 +241,7 @@ jobs: build-args: implnet=${{ matrix.project }} #file: ./dagster/implnets/build/Dockerfile - file: ./build/Dockerfile_code + file: ./build/Dockerfile_workflows context: "{{defaultContext}}:dagster/implnets" tags: ${{ steps.meta.outputs.tags }} # tags: nsfearthcube/ec_facets_client:latest diff --git a/.gitignore b/.gitignore index 75daa638..ae7c143f 100644 --- 
a/.gitignore +++ b/.gitignore @@ -8,3 +8,10 @@ venv/** /dagster/.telemetry/ /dagster/.telemetry/ .env + +/dagster/implnets/generatedCode/implnet-*/output/ + +/dagster/implnets/deployment/prod.env + +**/tmp** +/dagster/dagster_home/ diff --git a/.idea/misc.xml b/.idea/misc.xml index dd0668a7..fb9c2841 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,7 @@ - - + + + + \ No newline at end of file diff --git a/.idea/scheduler.iml b/.idea/scheduler.iml index 49c6a476..1f267fea 100644 --- a/.idea/scheduler.iml +++ b/.idea/scheduler.iml @@ -4,11 +4,11 @@ - + - + \ No newline at end of file diff --git a/NOTES.md b/NOTES.md index d3ce74cd..3864eda4 100644 --- a/NOTES.md +++ b/NOTES.md @@ -1,5 +1,8 @@ # Notes +need to do dynamic ops.assets +https://medium.com/@thegreat.rashid83/dagster-sensors-partition-c7a5205d4c0d + ## Development At the top level (dagster/implents) you can run @@ -28,3 +31,11 @@ will run just the task, and in editable form, i think. ## Some articles to review [Medium on Dagster with configurable API and asset examples](https://medium.com/@alexandreguitton_12701/notes-1-2-dagster-data-orchestrator-hands-on-2af6772b13d9) + +## Troubleshooting. +Keep the python versions in the DOCKER definitions in sync. GRPCC can be finicky + +aka: + +`FROM python:3.11-slim` + diff --git a/README.md b/README.md index 93bf8217..ba39b135 100644 --- a/README.md +++ b/README.md @@ -10,5 +10,8 @@ structured data on the web. Details of the approach can be found in the [github io](https://earthcube.github.io/scheduler/). +NOTE: Generate code brach v0_generated_code branch +This is the original code that utilized a generate code approach to build the workflows. 
+v0_generated_code is where gleaner and nabu config file updates should be done when using the original code diff --git a/dagster/dagster_home/.gitkeep b/dagster/dagster_home/.gitkeep new file mode 100644 index 00000000..79083c9a --- /dev/null +++ b/dagster/dagster_home/.gitkeep @@ -0,0 +1 @@ +This is a place where dagster.yamls can be kept for runs diff --git a/dagster/dagster_home/dagster.yaml b/dagster/dagster_home/dagster.yaml new file mode 100644 index 00000000..35033656 --- /dev/null +++ b/dagster/dagster_home/dagster.yaml @@ -0,0 +1,23 @@ +local_artifact_storage: + module: dagster.core.storage.root + class: LocalArtifactStorage + config: + base_dir: /Users/valentin/development/dev_earthcube/scheduler/dagster/dagster_home/ +run_coordinator: + module: dagster.core.run_coordinator + class: QueuedRunCoordinator + config: + max_concurrent_runs: 6 + # getting tags by copying from UI + tag_concurrency_limits: + - key: "ingest" + value: "docker" + limit: 3 + - key: "ingest" + value: "report" + limit: 2 + - key: "tenant_load" + value: "graph" + limit: 1 +telemetry: + enabled: false diff --git a/dagster/implnets/Makefile b/dagster/implnets/Makefile index 9f4778bb..a3f7aa4b 100644 --- a/dagster/implnets/Makefile +++ b/dagster/implnets/Makefile @@ -2,6 +2,14 @@ .SHELLFLAGS += -e VERSION :=`cat VERSION` +# ---- workflows ---- +# no code generation is neede for workflows + +wf-build: + podman build --tag="docker.io/fils/dagster_wf:$(VERSION)" --build-arg implnet=eco --file=./build/Dockerfile_workflows . 
+ +wf-push: + podman push docker.io/fils/dagster_wf:$(VERSION) # ---- ECO ---- diff --git a/dagster/implnets/VERSION b/dagster/implnets/VERSION index 9c3f756d..6e8bf73a 100644 --- a/dagster/implnets/VERSION +++ b/dagster/implnets/VERSION @@ -1 +1 @@ -0.0.67 +0.1.0 diff --git a/dagster/implnets/__init__.py b/dagster/implnets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dagster/implnets/build/Dockerfile_code b/dagster/implnets/build/Dockerfile_code index 1c6cf0a3..8b82293d 100644 --- a/dagster/implnets/build/Dockerfile_code +++ b/dagster/implnets/build/Dockerfile_code @@ -12,10 +12,11 @@ RUN mkdir -p /usr/src/app/workflows RUN pip install --upgrade pip ## this is a base for the project. Build this 'layer' first -COPY ./requirements_code.txt requirements.txt +COPY ./requirements.txt requirements.txt RUN pip install -r requirements.txt # this add the code +# this is only needed because we generate the code with pygen. otherwise added in compose-project.yaml docker compose COPY . scheduler COPY ./configs/${implnet}/gleanerconfig.yaml scheduler/gleanerconfig.yaml diff --git a/dagster/implnets/build/Dockerfile_local b/dagster/implnets/build/Dockerfile_local index acd36f34..98f40589 100644 --- a/dagster/implnets/build/Dockerfile_local +++ b/dagster/implnets/build/Dockerfile_local @@ -25,4 +25,4 @@ WORKDIR /usr/src/app ENV DAGSTER_HOME=/usr/src/app -CMD ["dagster-webserver", "-w", "./project/${implnet}/workspace.yaml", "-h", "0.0.0.0", "-p", "3000"] +CMD [ "dagster", "api","grpc", "-h", "0.0.0.0", "-p", "4000", "-m", "workflows.tasks.tasks", "-d", "/usr/src/app/"] diff --git a/dagster/implnets/build/Dockerfile_workflows b/dagster/implnets/build/Dockerfile_workflows new file mode 100644 index 00000000..24a70764 --- /dev/null +++ b/dagster/implnets/build/Dockerfile_workflows @@ -0,0 +1,43 @@ +FROM python:3.11-slim + + +# this file no longer needs to generate code. 
It will just include the base +# it will run ingest by default +# we may want to get an unreleased version of code, so this is needed + +RUN apt-get update && apt-get install -y git +RUN pip install --upgrade pip +RUN apt-get install -y gcc musl-dev python3-dev +#RUN apt-get install libffi-dev +# Read the ARG implnet to set who to build for. + +# docker buildandpush pulls the repo, so we need to put the code at a different location +# this fails becaus the dagster/implnets files are not in the docker +ARG implnet=eco + +RUN mkdir -p /usr/src/app/workflows + + +## this is a base for the project. Build this 'layer' first +COPY ./requirements.txt requirements.txt +RUN pip install -r requirements.txt + +# this add the code +COPY . scheduler +#COPY ./configs/${implnet}/gleanerconfig.yaml scheduler/gleanerconfig.yaml + +COPY ./deployment/dagster.yaml /usr/src/app/ + +WORKDIR scheduler + + +COPY ./workflows/ /usr/src/app/workflows + + + +# Change working directory +WORKDIR /usr/src/app +ENV DAGSTER_HOME=/usr/src/app + + +CMD [ "dagster", "api","grpc", "-h", "0.0.0.0", "-p", "4000", "-m", "workflows.tasks.tasks", "-d", "/usr/src/app/"] diff --git a/dagster/implnets/configs/a_test/headless/.gitkeep b/dagster/implnets/configs/a_test/headless/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/dagster/implnets/configs/eco/gleanerconfig.yaml b/dagster/implnets/configs/eco/gleanerconfig.yaml index 6ec78244..570da029 100644 --- a/dagster/implnets/configs/eco/gleanerconfig.yaml +++ b/dagster/implnets/configs/eco/gleanerconfig.yaml @@ -1,6 +1,6 @@ context: cache: true - strict: true + strict: false contextmaps: - prefix: "https://schema.org/" file: "/assets/schemaorg-current-https.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld @@ -44,9 +44,610 @@ sources: identifierpath: "" apipagelimit: 0 identifiertype: identifiersha + fixcontextoption: 1 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitegraph + 
name: aquadocs + logo: "" + url: https://oih.aquadocs.org/aquadocs.json + headless: false + pid: http://hdl.handle.net/1834/41372 + propername: AquaDocs + domain: https://aquadocs.org + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: bcodmo + logo: https://www.bco-dmo.org/sites/all/themes/bcodmo/logo.png + url: https://www.bco-dmo.org/sitemap.xml + headless: false + pid: https://www.re3data.org/repository/r3d100000012 + propername: Biological and Chemical Oceanography Data Management Office + domain: http://www.bco-dmo.org/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: cchdo + logo: https://cchdo.ucsd.edu/static/svg/logo_cchdo.svg + url: https://cchdo.ucsd.edu/sitemap.xml + headless: false + pid: https://www.re3data.org/repository/r3d100010831 + propername: CLIVAR and Carbon Hydrographic Data Office + domain: https://cchdo.ucsd.edu/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: datadiscoverystudio + logo: http://datadiscoverystudio.org/geoportal/images/DataDiscoveryStudioBufferedWhite.png + url: http://datadiscoverystudio.org/sitemap/CinergiSiteIndex.xml + headless: false + pid: "" + propername: "" + domain: http://datadiscoverystudio.org/geoportal + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: 
"" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: designsafe + logo: "" + url: https://www.designsafe-ci.org/sitemap.xml + headless: false + pid: "" + propername: "" + domain: https://www.designsafe-ci.org/data/browser/public/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: earthchem + logo: http://www.earthchem.org/sites/default/files/files/EC_0-1.png + url: https://ecl.earthchem.org/sitemap.xml + headless: false + pid: https://www.re3data.org/repository/r3d100011538 + propername: earthchem + domain: https://ecl.earthchem.org/home.php + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: $.sameAs + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: ecrr_examples + logo: https://www.earthcube.org/sites/default/files/doc-repository/logo_earthcube_full_horizontal.png + url: https://earthcube.github.io/ecrro/Examples/sitemap.xml + headless: false + pid: http://www.earthcube.org/resourceregistry/examples + propername: Earthcube Resource Registry Examples + domain: http://www.earthcube.org/resourceregistry/examples + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifierstring + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: edi + logo: https://portal.edirepository.org/nis/images/EDI-logo-300DPI_5.png + url: 
https://portal.edirepository.org/sitemap_index.xml + headless: false + pid: https://www.re3data.org/repository/r3d100010272 + propername: Environmental Data Initiative + domain: 'http://environmentaldatainitiative.org/ ' + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: geocodes_demo_datasets + logo: "" + url: https://earthcube.github.io/GeoCODES-Metadata/metadata/Dataset/allgood/sitemap.xml + headless: false + pid: https://github.com/earthcube/GeoCODES-Metadata/metadata/OtherResources + propername: Geocodes Demo Datasets + domain: https://www.earthcube.org/datasets/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: geocodes_examples + logo: "" + url: https://earthcube.github.io/GeoCODES-Metadata/metadata/Dataset/allgood/sitemap.xml + headless: false + pid: https://github.com/earthcube/GeoCODES-Metadata/ + propername: GeoCodes Tools Examples + domain: https://github.com/earthcube/GeoCODES-Metadata/ + active: true + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifierstring + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: hydroshare + logo: https://www.hydroshare.org/static/img/logo-lg.png + url: https://www.hydroshare.org/sitemap-resources.xml + headless: false + pid: https://www.re3data.org/repository/r3d100012625 + propername: Consortium of Universities for the Advancement of Hydrologic Science, Inc. 
(CUAHSI) + domain: https://www.cuahsi.org/ + active: false + credentialsfile: "" + other: { } + headlesswait: -1 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: iedadata + logo: https://www.iedadata.org/wp-content/themes/IEDA/assets/img/logo.png + url: http://get.iedadata.org/doi/xml-sitemap.php + headless: false + pid: https://www.re3data.org/repository/r3d100010578 + propername: IEDA (Integrated Earth Data Applications) + domain: http://www.iedadata.org/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: $.sameAs + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: iris + logo: http://ds.iris.edu/static/img/layout/logos/iris_logo_shadow.png + url: http://ds.iris.edu/files/sitemap.xml + headless: false + pid: https://www.re3data.org/repository/r3d100010268 + propername: IRIS + domain: http://iris.edu + active: true + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: linkedearth + logo: http://wiki.linked.earth/wiki/images/thumb/5/51/EarthLinked_Banner_blue_NoShadow.jpg/440px-EarthLinked_Banner_blue_NoShadow.jpg + url: http://wiki.linked.earth/sitemap.xml + headless: false + pid: http://wiki.linked.earth + propername: Linked Earth + domain: http://wiki.linked.earth + active: false + credentialsfile: "" + other: { } + headlesswait: -1 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + 
jsonprofile: application/ld+json + - sourcetype: sitemap + name: lipdverse + logo: "" + url: https://lipdverse.org/sitemap.xml + headless: false + pid: https://lipdverse.org + propername: Linked PaleoData + domain: https://lipdverse.org/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: magic + logo: http://mbobak.ncsa.illinois.edu/ext/ec/magic/MagIC.png + url: https://www2.earthref.org/MagIC/contributions.sitemap.xml + headless: true + pid: http://www.re3data.org/repository/r3d100011910 + propername: Magnetics Information Consortium (MagIC) + domain: https://www.earthref.org/MagIC + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: $.sameAs + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: neon + logo: https://www.neonscience.org/themes/custom/neon/logo.svg + url: https://geodex.org/neon_prodcodes_sm.xml + headless: false + pid: http://www.re3data.org/repository/r3d100010290 + propername: National Ecological Observatory Network (NEON) + domain: http://www.neonscience.org/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: neotomadb + logo: https://www.neotomadb.org/images/site_graphics/Packrat.png + url: http://data.neotomadb.org/sitemap.xml + headless: true + pid: http://www.re3data.org/repository/r3d100011761 + propername: Neotoma + domain: http://www.neotomadb.org/ + active: false + credentialsfile: "" + 
other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: opencoredata + logo: https://opencoredata.org/img/logo22small.png + url: http://opencoredata.org/sitemap.xml + headless: false + pid: https://www.re3data.org/repository/r3d100012874 + propername: opencoredata + domain: https://opencoredata.org/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: opentopography + logo: https://opentopography.org/sites/opentopography.org/files/ot_transp_logo_2.png + url: https://portal.opentopography.org/sitemap.xml + headless: false + pid: https://www.re3data.org/repository/r3d100010655 + propername: OpenTopography + domain: http://www.opentopography.org/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: r2r + logo: https://www.rvdata.us/images/Logo.4b1519be.png + url: https://service-dev.rvdata.us/api/sitemap/ + headless: true + pid: http://www.re3data.org/repository/r3d100010735 + propername: Rolling Deck to Repository Program (R2R) + domain: https://www.rvdata.us/ + active: false + credentialsfile: "" + other: { } + headlesswait: 5 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: resource_registry + logo: 
https://www.earthcube.org/sites/default/files/doc-repository/logo_earthcube_full_horizontal.png + url: https://object.cloud.sdsc.edu/v1/AUTH_85f46aa78936477d8e71b186269414e8/gleaner-summoned + headless: false + pid: "" + propername: Resource Registry + domain: http://www.earthcube.org/resourceregistry/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: $.@id + apipagelimit: 0 + identifiertype: identifierstring + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: ssdbiodp + logo: http://ssdb.iodp.org/images/head_logo_PRO.gif + url: https://ssdb.iodp.org/dataset/sitemap.xml + headless: false + pid: https://www.re3data.org/repository/r3d100010267 + propername: IODP Site Survey Databank + domain: https://ssdb.iodp.org/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: ucar + logo: https://opensky.ucar.edu/islandora/object/opensky%3Aucommunity/datastream/TN/view + url: https://data.ucar.edu/sitemap.xml + headless: false + pid: https://www.re3data.org/repository/r3d100010791 + propername: UCAR + domain: https://data.ucar.edu + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: unavco + logo: https://www.unavco.org/lib/images/banner/uv-logo.png + url: https://www.unavco.org/data/doi/sitemap.xml + headless: false + pid: http://www.re3data.org/repository/r3d100010872 + propername: UNAVCO + domain: http://www.unavco.org/ + active: false + credentialsfile: "" + other: { } 
+ headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: unidata + logo: "" + url: https://www.unidata.ucar.edu/sitemap.xml + headless: false + pid: https://www.re3data.org/repository/r3d100010355 + propername: UNIDATA + domain: http://www.unidata.ucar.edu/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: usapdc + logo: https://www.usap-dc.org/ + url: https://www.usap-dc.org/view/dataset/sitemap.xml + headless: true + pid: https://www.re3data.org/repository/r3d100010660 + propername: U.S. Antarctic Program Data Center + domain: https://www.usap-dc.org/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: wifire + logo: https://wifire-data.sdsc.edu/uploads/admin/2021-04-22-203649.712143WIFIRECOMMONSSMRES12.png + url: https://wifire-data.sdsc.edu/sitemap.xml + headless: false + pid: https://wifire-data.sdsc.edu/ + propername: WIFIRE Commons + domain: https://wifire-data.sdsc.edu/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: hydrography90m + logo: "" + url: 
https://raw.githubusercontent.com/earthcube/communityCollections/master/collection/hydrography90m/sitemaps/hydrography90m.xml + headless: false + pid: https://hydrography.org/hydrography90m/hydrography90m_layers + propername: Hydrography90m + domain: https://hydrography.org/hydrography90m/hydrography90m_layers + active: true + credentialsfile: "" + other: {} + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha fixcontextoption: 0 acceptcontenttype: application/ld+json, text/html jsonprofile: application/ld+json + - sourcetype: sitemap + name: neoncontext: + cache: true + strict: false +contextmaps: + - prefix: "https://schema.org/" + file: "/assets/schemaorg-current-https.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld + - prefix: "http://schema.org/" + file: "/assets/schemaorg-current-http.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld +gleaner: + mill: true + runid: runX + summon: true +summoner: + after: "" + delay: # milliseconds (1000 = 1 second) to delay between calls (will FORCE threads to 1) + # will headless, the name of the container "service" work + headless: http://headless:9222 + #headless: http://localhost:9222 + mode: full + threads: 5 +millers: + graph: true +minio: + address: + port: + ssl: + accessKey: + secretKey: + bucket: +sources: + - sourcetype: sitemap + name: amgeo + logo: https://amgeo.colorado.edu/static/img/amgeosmall.svg + url: https://amgeo-dev.colorado.edu/sitemap.xml + headless: false + pid: "" + propername: Assimilative Mapping of Geospace Observations + domain: https://amgeo.colorado.edu/ + active: false + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 1 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json - sourcetype: sitegraph name: aquadocs logo: "" @@ -131,7 +732,7 @@ sources: pid: "" propername: "" domain: 
https://www.designsafe-ci.org/data/browser/public/ - active: true + active: false credentialsfile: "" other: { } headlesswait: 0 @@ -150,7 +751,7 @@ sources: pid: https://www.re3data.org/repository/r3d100011538 propername: earthchem domain: https://ecl.earthchem.org/home.php - active: true + active: false credentialsfile: "" other: { } headlesswait: 0 @@ -164,12 +765,12 @@ sources: - sourcetype: sitemap name: ecrr_examples logo: https://www.earthcube.org/sites/default/files/doc-repository/logo_earthcube_full_horizontal.png - url: https://raw.githubusercontent.com/earthcube/ecrro/master/Examples/sitemap.xml + url: https://earthcube.github.io/ecrro/Examples/sitemap.xml headless: false pid: http://www.earthcube.org/resourceregistry/examples propername: Earthcube Resource Registry Examples domain: http://www.earthcube.org/resourceregistry/examples - active: true + active: false credentialsfile: "" other: { } headlesswait: 0 @@ -207,7 +808,7 @@ sources: pid: https://github.com/earthcube/GeoCODES-Metadata/metadata/OtherResources propername: Geocodes Demo Datasets domain: https://www.earthcube.org/datasets/ - active: true + active: false credentialsfile: "" other: { } headlesswait: 0 @@ -245,7 +846,7 @@ sources: pid: https://www.re3data.org/repository/r3d100012625 propername: Consortium of Universities for the Advancement of Hydrologic Science, Inc. 
(CUAHSI) domain: https://www.cuahsi.org/ - active: true + active: false credentialsfile: "" other: { } headlesswait: -1 @@ -264,7 +865,7 @@ sources: pid: https://www.re3data.org/repository/r3d100010578 propername: IEDA (Integrated Earth Data Applications) domain: http://www.iedadata.org/ - active: true + active: false credentialsfile: "" other: { } headlesswait: 0 @@ -302,7 +903,7 @@ sources: pid: http://wiki.linked.earth propername: Linked Earth domain: http://wiki.linked.earth - active: true + active: false credentialsfile: "" other: { } headlesswait: -1 @@ -340,7 +941,7 @@ sources: pid: http://www.re3data.org/repository/r3d100011910 propername: Magnetics Information Consortium (MagIC) domain: https://www.earthref.org/MagIC - active: true + active: false credentialsfile: "" other: { } headlesswait: 0 @@ -378,7 +979,7 @@ sources: pid: http://www.re3data.org/repository/r3d100011761 propername: Neotoma domain: http://www.neotomadb.org/ - active: true + active: false credentialsfile: "" other: { } headlesswait: 0 @@ -416,7 +1017,7 @@ sources: pid: https://www.re3data.org/repository/r3d100010655 propername: OpenTopography domain: http://www.opentopography.org/ - active: true + active: false credentialsfile: "" other: { } headlesswait: 0 @@ -473,7 +1074,7 @@ sources: pid: https://www.re3data.org/repository/r3d100010267 propername: IODP Site Survey Databank domain: https://ssdb.iodp.org/ - active: true + active: false credentialsfile: "" other: { } headlesswait: 0 @@ -511,7 +1112,7 @@ sources: pid: http://www.re3data.org/repository/r3d100010872 propername: UNAVCO domain: http://www.unavco.org/ - active: true + active: false credentialsfile: "" other: { } headlesswait: 0 @@ -549,7 +1150,7 @@ sources: pid: https://www.re3data.org/repository/r3d100010660 propername: U.S. 
Antarctic Program Data Center domain: https://www.usap-dc.org/ - active: true + active: false credentialsfile: "" other: { } headlesswait: 0 @@ -568,7 +1169,7 @@ sources: pid: https://wifire-data.sdsc.edu/ propername: WIFIRE Commons domain: https://wifire-data.sdsc.edu/ - active: true + active: false credentialsfile: "" other: { } headlesswait: 0 @@ -579,4 +1180,191 @@ sources: fixcontextoption: 0 acceptcontenttype: application/ld+json, text/html jsonprofile: application/ld+json - + - sourcetype: sitemap + name: hydrography90m + logo: "" + url: https://raw.githubusercontent.com/earthcube/communityCollections/master/collection/hydrography90m/sitemaps/hydrography90m.xml + headless: false + pid: https://hydrography.org/hydrography90m/hydrography90m_layers + propername: Hydrography90m + domain: https://hydrography.org/hydrography90m/hydrography90m_layers + active: true + credentialsfile: "" + other: {} + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: neon4cast + logo: "" + url: https://raw.githubusercontent.com/earthcube/stacIndexer/yl_dv/data/output/sitemap/sitemap_neon4cast.xml + headless: false + pid: https://projects.ecoforecast.org/neon4cast-ci/ + propername: NEON Ecological Forecast Challenge + domain: https://projects.ecoforecast.org/neon4cast-ci/ + active: true + credentialsfile: "" + other: {} + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: usgsrc4cast + logo: "" + url: https://raw.githubusercontent.com/earthcube/stacIndexer/yl_dv/data/output/sitemap/sitemap_usgsrc4cast.xml + headless: false + pid: https://github.com/eco4cast/usgsrc4cast-ci + propername: Ecological Forecasting 
Initiative (EFI) and U.S. Geological Survey (USGS) River Chlorophyll Forecasting Challenge + domain: https://github.com/eco4cast/usgsrc4cast-ci + active: true + credentialsfile: "" + other: {} + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: vera4cast + logo: "" + url: https://raw.githubusercontent.com/earthcube/stacIndexer/yl_dv/data/output/sitemap/sitemap_vera4cast.xml + headless: false + pid: https://github.com/LTREB-reservoirs/vera4cast + propername: Virginia Ecoforecast Reservoir Analysis (VERA) Ecological Forecasting Challenge + domain: https://github.com/LTREB-reservoirs/vera4cast + active: true + credentialsfile: "" + other: {} + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json4cast + logo: "" + url: https://raw.githubusercontent.com/earthcube/stacIndexer/yl_dv/data/output/sitemap/sitemap_neon4cast.xml + headless: false + pid: https://projects.ecoforecast.org/neon4cast-ci/ + propername: NEON Ecological Forecast Challenge + domain: https://projects.ecoforecast.org/neon4cast-ci/ + active: true + credentialsfile: "" + other: {} + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: usgsrc4cast + logo: "" + url: https://raw.githubusercontent.com/earthcube/stacIndexer/yl_dv/data/output/sitemap/sitemap_usgsrc4cast.xml + headless: false + pid: https://github.com/eco4cast/usgsrc4cast-ci + propername: Ecological Forecasting Initiative (EFI) and U.S. 
Geological Survey (USGS) River Chlorophyll Forecasting Challenge + domain: https://github.com/eco4cast/usgsrc4cast-ci + active: true + credentialsfile: "" + other: {} + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: vera4cast + logo: "" + url: https://raw.githubusercontent.com/earthcube/stacIndexer/yl_dv/data/output/sitemap/sitemap_vera4cast.xml + headless: false + pid: https://github.com/LTREB-reservoirs/vera4cast + propername: Virginia Ecoforecast Reservoir Analysis (VERA) Ecological Forecasting Challenge + domain: https://github.com/LTREB-reservoirs/vera4cast + active: true + credentialsfile: "" + other: {} + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: osmc + logo: "" + url: https://osmc.noaa.gov/erddap/sitemap.xml + headless: false + pid: https://osmc.noaa.gov/erddap + propername: Easier access to scientific data + domain: https://osmc.noaa.gov/erddap + active: true + credentialsfile: "" + other: {} + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: obis + logo: https://obis.org/images/logo.png + url: https://obis-sitemaps.s3.amazonaws.com/sitemap_datasets.xml + headless: false + pid: https://catalogue.odis.org/view/343 + propername: Ocean Biodiversity Information System (OBIS) + domain: https://obis.org + active: true + credentialsfile: "" + other: {} + headlesswait: -1 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: 
application/ld+json, text/html + jsonprofile: application/ld+json + - sourcetype: sitemap + name: geochemistry_custom + logo: + url: https://oss.geocodes.ncsa.illinois.edu/decoder/sitemaps/geochemistry_sitemap.xml + headless: false + pid: https://catalogue.odis.org/view/343 + propername: Geochemistry Custom Datasets + domain: https://obis.org + active: true + credentialsfile: "" + other: {} + headlesswait: -1 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 0 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json \ No newline at end of file diff --git a/dagster/implnets/configs/eco/nabuconfig.yaml b/dagster/implnets/configs/eco/nabuconfig.yaml index 75ff07e4..3218d9a3 100644 --- a/dagster/implnets/configs/eco/nabuconfig.yaml +++ b/dagster/implnets/configs/eco/nabuconfig.yaml @@ -7,14 +7,34 @@ minio: ssl: true context: cache: true - strict: true + strict: false contextmaps: - prefix: "https://schema.org/" file: "/assets/schemaorg-current-https.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld - prefix: "http://schema.org/" file: "/assets/schemaorg-current-http.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld -sparql: - endpoint: +implementation_network: + orgname: eco +endpoints: + - service: ec_blazegraph + baseurl: https://graph.geocodes-aws-dev.earthcube.org/blazegraph/namespace/test + type: blazegraph + authenticate: false + username: + password: + modes: + - action: sparql + suffix: /sparql + accept: application/sparql-results+json + method: GET + - action: update + suffix: /sparql + accept: application/sparql-update + method: POST + - action: bulk + suffix: /sparql + accept: text/x-nquads + method: POST objects: domain: us-east-1 prefix: @@ -30,6 +50,17 @@ objects: - summoned/r2r - summoned/ssdbiodp - summoned/unavco + - summoned/glim + - summoned/gpp + - summoned/nitrogen + - summoned/nitrogen2 + - summoned/hydrography90m + - summoned/neon4cast + - 
summoned/usgsrc4cast + - summoned/vera4cast + - summoned/osmc + - summoned/obis + - summoned/geochemistry_custom - prov/aquadocs - prov/bcodmo - prov/cchdo diff --git a/dagster/implnets/configs/eco/tenant.yaml b/dagster/implnets/configs/eco/tenant.yaml new file mode 100644 index 00000000..8993833c --- /dev/null +++ b/dagster/implnets/configs/eco/tenant.yaml @@ -0,0 +1,41 @@ +# prototype tennants file + +# prototype tennants file + +tenant: + - community: dev + hostname: geocodes-dev + description: GeoCodes is... + name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: test + summary_namespace: test_summary + sources: + - iris + - geocodes_demo_datasets +###### + - community: geocodesall + hostname: geocodes-all + description: GeoCodes is... + name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: geocodes_test + summary_namespace: geocodes_test_summary + sources: + - all +# - community: dev3 +# hostname: geocodes-dev32 +# description: GeoCodes is... 
+# name: Geocodes Science on Schema +# url: https://www.earthcube.org +# logo: https://unsplash.com/random +# graph: +# main_namespace: test3 +# summary_namespace: test3_summary +# sources: +# - iris +# - geocodes_examples diff --git a/dagster/implnets/configs/eco/workspace.yaml b/dagster/implnets/configs/eco/workspace.yaml index b482024f..286e70e4 100644 --- a/dagster/implnets/configs/eco/workspace.yaml +++ b/dagster/implnets/configs/eco/workspace.yaml @@ -14,10 +14,14 @@ load_from: port: 4000 location_name: "tasks" - grpc_server: - host: dagster-code-project + host: dagster-code-ingest port: 4000 - location_name: "project_grpc" + location_name: "ingest" +# - grpc_server: +# host: dagster-code-project +# port: 4000 +# location_name: "project_grpc" - grpc_server: host: dagster-code-eco-ecrr port: 4000 - location_name: "project_ecrr" + location_name: "ecrr" diff --git a/dagster/implnets/configs/ecrr/gleanerconfig.yaml b/dagster/implnets/configs/ecrr/gleanerconfig.yaml index 69fa76e5..fa4fbce4 100644 --- a/dagster/implnets/configs/ecrr/gleanerconfig.yaml +++ b/dagster/implnets/configs/ecrr/gleanerconfig.yaml @@ -20,6 +20,26 @@ minio: accesskey: secretkey: sources: + # not a sitemap... an s3 directory.. readable by nabu... but this will be difficult. 
+# - sourcetype: sitemap +# name: resource_registry +# logo: https://www.earthcube.org/sites/default/files/doc-repository/logo_earthcube_full_horizontal.png +# url: https://object.cloud.sdsc.edu/v1/AUTH_85f46aa78936477d8e71b186269414e8/gleaner-summoned +# headless: false +# pid: "" +# propername: Resource Registry +# domain: http://www.earthcube.org/resourceregistry/ +# active: true +# credentialsfile: "" +# other: { } +# headlesswait: 0 +# delay: 0 +# identifierpath: $.@id +# apipagelimit: 0 +# identifiertype: identifierstring +# fixcontextoption: 0 +# acceptcontenttype: application/ld+json, text/html +# jsonprofile: application/ld+json - sourcetype: googledrive name: ecrr_submitted logo: https://www.earthcube.org/sites/default/files/doc-repository/logo_earthcube_full_horizontal.png @@ -42,7 +62,7 @@ sources: - sourcetype: sitemap name: ecrr_examples logo: https://www.earthcube.org/sites/default/files/doc-repository/logo_earthcube_full_horizontal.png - url: https://raw.githubusercontent.com/earthcube/ecrro/gh-pages/Examples/sitemap.xml + url: https://earthcube.github.io/ecrro/Examples/sitemap.xml headless: false pid: "" propername: Earthcube Resource Registry Examples diff --git a/dagster/implnets/configs/ecrr/nabuconfing.yaml b/dagster/implnets/configs/ecrr/nabuconfing.yaml index dc65407d..9631a62e 100644 --- a/dagster/implnets/configs/ecrr/nabuconfing.yaml +++ b/dagster/implnets/configs/ecrr/nabuconfing.yaml @@ -15,10 +15,27 @@ objects: - org prefixoff: - summoned/ecrr_examples -sparql: - endpoint: http://localhost/blazegraph/namespace/earthcube/sparql +implementation_network: + orgname: ecrr +endpoints: + - service: ec_blazegraph + baseurl: http://localhost/blazegraph/namespace/earthcube/sparql + type: blazegraph authenticate: false - username: "" - password: "" + username: + password: + modes: + - action: sparql + suffix: /sparql + accept: application/sparql-results+json + method: GET + - action: update + suffix: /sparql + accept: application/sparql-update + 
method: POST + - action: bulk + suffix: /sparql + accept: text/x-nquads + method: POST txtaipkg: endpoint: http://0.0.0.0:8000 diff --git a/dagster/implnets/configs/iow/workspace.yaml b/dagster/implnets/configs/iow/workspace.yaml index 22d79140..0b4702cb 100644 --- a/dagster/implnets/configs/iow/workspace.yaml +++ b/dagster/implnets/configs/iow/workspace.yaml @@ -14,9 +14,13 @@ load_from: port: 4000 location_name: "tasks" - grpc_server: - host: dagster-code-project + host: dagster-code-ingest port: 4000 - location_name: "project_grpc" + location_name: "ingest" +# - grpc_server: +# host: dagster-code-project +# port: 4000 +# location_name: "project_grpc" # - grpc_server: # host: dagster-code-iow-ecrr # port: 4000 diff --git a/dagster/implnets/configs/nsdf/workspace.yaml b/dagster/implnets/configs/nsdf/workspace.yaml index 2b92249b..62667004 100644 --- a/dagster/implnets/configs/nsdf/workspace.yaml +++ b/dagster/implnets/configs/nsdf/workspace.yaml @@ -7,5 +7,12 @@ load_from: # relative_path: "workflows/ecrr/repositories/repository.py" # working_directory: "./workflows/ecrr/" # module starting out with the definitions api - - python_module: "workflows.tasks.tasks" + - grpc_server: + host: dagster-code-ingest + port: 4000 + location_name: "ingest" +# - grpc_server: +# host: dagster-code-project +# port: 4000 +# location_name: "project_grpc" diff --git a/dagster/implnets/configs/oih/workspace.yaml b/dagster/implnets/configs/oih/workspace.yaml index 8ceb08fe..ded83cee 100644 --- a/dagster/implnets/configs/oih/workspace.yaml +++ b/dagster/implnets/configs/oih/workspace.yaml @@ -7,5 +7,12 @@ load_from: # relative_path: "workflows/ecrr/repositories/repository.py" # working_directory: "./workflows/ecrr/" # module starting out with the definitions api - - python_module: "workflows.tasks.tasks" + - grpc_server: + host: dagster-code-ingest + port: 4000 + location_name: "ingest" + # - grpc_server: + # host: dagster-code-project + # port: 4000 + # location_name: "project_grpc" 
diff --git a/dagster/implnets/deployment/compose.yaml b/dagster/implnets/deployment/compose.yaml index 0e1ccba7..3431d06e 100644 --- a/dagster/implnets/deployment/compose.yaml +++ b/dagster/implnets/deployment/compose.yaml @@ -80,7 +80,7 @@ services: - "traefik.http.services.sched-${PROJECT:-eco}.loadbalancer.server.port=3000" - "traefik.docker.network=traefik_proxy" - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accesscontrolallowmethods=GET,OPTIONS,POST" - - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accesscontrolalloworigin=*" + - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accessControlAllowOriginList=*" - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accesscontrolmaxage=100" - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.addvaryheader=true" dagster-daemon: diff --git a/dagster/implnets/deployment/compose_local.yaml b/dagster/implnets/deployment/compose_local.yaml index fd935c45..6ed3d4b7 100644 --- a/dagster/implnets/deployment/compose_local.yaml +++ b/dagster/implnets/deployment/compose_local.yaml @@ -11,15 +11,16 @@ networks: volumes: dagster-postgres: driver: local - +# dagster-storage: +# driver: local # let yourself use local configuration configs: - gleaner: - name: ${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner} - file: ../configs/${PROJECT:-eco}/gleanerconfig.yaml - nabu: - name: ${GLEANERIO_NABU_DOCKER_CONFIG:-nabu} - file: ../configs/${PROJECT:-eco}/nabuconfig.yaml +# gleaner: +# name: ${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner} +# file: ../configs/${PROJECT:-eco}/gleanerconfig.yaml +# nabu: +# name: ${GLEANERIO_DOCKER_NABU_CONFIG:-nabu} +# file: ../configs/${PROJECT:-eco}/nabuconfig.yaml workspace: name: ${GLEANERIO_WORKSPACE_DOCKER_CONFIG:-workspace} file: ../configs/${PROJECT:-eco}/workspace.yaml @@ -46,49 +47,62 @@ services: # gid: "103" mode: 0444 - - source: gleaner - target: /scheduler/gleanerconfig.yaml - mode: - 0444 - - source: nabu - target: /scheduler/nabuconfig.yaml - # uid: "103" - # 
gid: "103" - mode: - 044 +# - source: gleaner +# target: /scheduler/gleanerconfig.yaml +# mode: +# 0444 +# - source: nabu +# target: /scheduler/nabuconfig.yaml +# # uid: "103" +# # gid: "103" +# mode: +# 044 volumes: &vol - ../deployment/dagster.yaml:/usr/src/app/dagster.yaml - - ../generatedCode/implnet-${PROJECT:-eco}/output/:/usr/src/app/project/${PROJECT:-eco} + # - ../generatedCode/implnet-${PROJECT:-eco}/output/:/usr/src/app/project/${PROJECT:-eco} - ../workflows/:/usr/src/app/workflows # GLEANEERIO_ the environment variables for this stack, passed into containers # the variables passed into the containers varies due to inconsistent standards. # this there are prefixed by project aka ECO_ for customization # DO NOT RENAME THE FIRST PART, aka the container environment variable, # unless you sure what you are doing + # sort these in BBedit to make finding them easier environment: &env - - DEBUG=${DEBUG:-false} - - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python - - PORTAINER_URL=${PORTAINER_URL} - - PORTAINER_KEY=${PORTAINER_KEY} + - DEBUG_CONTAINER=${DEBUG_CONTAINER:-false} + - GLEANERIO_CONFIG_PATH=${GLEANERIO_CONFIG_PATH:-scheduler/configs/test/} + - GLEANERIO_DAGSTER_CONFIG_PATH=${GLEANERIO_DAGSTER_CONFIG_PATH:-scheduler/logs/} + - GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=${GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT:-300} + - GLEANERIO_DOCKER_GLEANER_CONFIG=${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner} + - GLEANERIO_DOCKER_HEADLESS_NETWORK=${GLEANERIO_DOCKER_HEADLESS_NETWORK} + - GLEANERIO_DOCKER_NABU_CONFIG=${GLEANERIO_DOCKER_NABU_CONFIG:-nabu} + - GLEANERIO_DOCKER_URL=${GLEANERIO_DOCKER_URL} + - GLEANERIO_DOCKER_WORKSPACE_CONFIG=${GLEANERIO_DOCKER_WORKSPACE_CONFIG} + - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} - GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE} - - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} - - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX} - - 
GLEANERIO_SUMMARY_GRAPH_ENDPOINT=${GLEANERIO_SUMMARY_GRAPH_ENDPOINT} - - GLEANERIO_SUMMARY_GRAPH_NAMESPACE=${GLEANERIO_SUMMARY_GRAPH_NAMESPACE} + - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE} + - GLEANERIO_GRAPH_SUMMARIZE=${GLEANERIO_GRAPH_SUMMARIZE:-false} + - GLEANERIO_GRAPH_SUMMARY_ENDPOINT=${GLEANERIO_GRAPH_SUMMARY_ENDPOINT:-${GLEANERIO_GRAPH_URL}} + - GLEANERIO_GRAPH_SUMMARY_NAMESPACE=${GLEANERIO_GRAPH_SUMMARY_NAMESPACE} + - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL} + - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} + - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX:-scheduler/logs/} + - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} - GLEANERIO_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS} - - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT} - - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} - GLEANERIO_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET} - - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} + - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT} - GLEANERIO_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY} - - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} - - GLEANERIO_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK} - - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL} - - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE} + - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} - GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml} - - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} - - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu} - - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner} + - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} + - GLEANERIO_PORTAINER_APIKEY=${GLEANERIO_PORTAINER_APIKEY} + - GLEANERIO_DEFAULT_SCHEDULE=${GLEANERIO_DEFAULT_SCHEDULE:-@weekly} + - GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE=${GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE:-"America/Los_Angeles"} + - 
GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml} + - GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml} + - GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH} + - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + - SLACK_CHANNEL=${SLACK_CHANNEL:-"#twitterfeed"} + - SLACK_TOKEN=${SLACK_TOKEN} ports: - 3000:3000 @@ -96,7 +110,8 @@ services: - traefik_proxy depends_on: - dagster-postgres - - dagster-code-project +# - dagster-code-project + - dagster-code-ingest - dagster-code-tasks labels: - "traefik.enable=true" @@ -114,7 +129,7 @@ services: - "traefik.http.services.sched.loadbalancer.server.port=3000" - "traefik.docker.network=traefik_proxy" - "traefik.http.middlewares.sched.headers.accesscontrolallowmethods=GET,OPTIONS,POST" - - "traefik.http.middlewares.sched.headers.accesscontrolalloworigin=*" + - "traefik.http.middlewares.sched.headers.accessControlAllowOriginList=*" - "traefik.http.middlewares.sched.headers.accesscontrolmaxage=100" - "traefik.http.middlewares.sched.headers.addvaryheader=true" dagster-daemon: @@ -134,7 +149,8 @@ services: volumes: *vol depends_on: - dagster-postgres - - dagster-code-project +# - dagster-code-project + - dagster-code-ingest - dagster-code-tasks networks: - traefik_proxy @@ -192,14 +208,13 @@ services: - traefik_proxy - headless # in code, use names defined in network above - - dagster-code-tasks: + dagster-code-ingest: platform: linux/x86_64 build: #context: . context: .. - dockerfile: build/Dockerfile_code + dockerfile: build/Dockerfile_workflows args: implnet: ${PROJECT:-eco} # you should be able to change the source locally, without a full rebuild. @@ -216,7 +231,7 @@ services: - "-p" - "4000" - "-m" - - "workflows.tasks.tasks" + - "workflows.ingest.ingest" - "-d" - "/usr/src/app/" @@ -225,13 +240,14 @@ services: - dagster-postgres networks: - traefik_proxy - dagster-code-project: + + dagster-code-tasks: platform: linux/x86_64 build: #context: . context: .. 
- dockerfile: build/Dockerfile_code + dockerfile: build/Dockerfile_workflows args: implnet: ${PROJECT:-eco} # you should be able to change the source locally, without a full rebuild. @@ -247,13 +263,45 @@ services: - "0.0.0.0" - "-p" - "4000" - - "--python-file" - - "/usr/src/app/project/${PROJECT:-eco}/repositories/repository.py" + - "-m" + - "workflows.tasks.tasks" - "-d" - - "/usr/src/app/project/${PROJECT:-eco}/" + - "/usr/src/app/" volumes: *vol depends_on: - dagster-postgres networks: - traefik_proxy +# dagster-code-project: +# +# platform: linux/x86_64 +# build: +# #context: . +# context: .. +# dockerfile: build/Dockerfile_code +# args: +# implnet: ${PROJECT:-eco} +# # you should be able to change the source locally, without a full rebuild. +# #image: dagster-${PROJECT:-eco}:latest +# image: dagster-gleanerio-local:latest +# +# environment: *env +# command: +# - "dagster" +# - "api" +# - "grpc" +# - "-h" +# - "0.0.0.0" +# - "-p" +# - "4000" +# - "--python-file" +# - "/usr/src/app/project/${PROJECT:-eco}/repositories/repository.py" +# - "-d" +# - "/usr/src/app/project/${PROJECT:-eco}/" +# +# volumes: *vol +# depends_on: +# - dagster-postgres +# networks: +# - traefik_proxy diff --git a/dagster/implnets/deployment/compose_local_eco_override.yaml b/dagster/implnets/deployment/compose_local_eco_override.yaml index bb160303..4e39299c 100644 --- a/dagster/implnets/deployment/compose_local_eco_override.yaml +++ b/dagster/implnets/deployment/compose_local_eco_override.yaml @@ -12,7 +12,7 @@ services: volumes: &vol - ../configs/${PROJECT:-eco}/workspace.yaml:/usr/src/app/workspace.yaml - ../deployment/dagster.yaml:/usr/src/app/dagster.yaml - - ../generatedCode/implnet-${PROJECT:-eco}/output/:/usr/src/app/project/${PROJECT:-eco} + # - ../generatedCode/implnet-${PROJECT:-eco}/output/:/usr/src/app/project/${PROJECT:-eco} - ../workflows/:/usr/src/app/workflows # GLEANEERIO_ the environment variables for this stack, passed into containers # the variables passed into the 
containers varies due to inconsistent standards. @@ -20,35 +20,50 @@ services: # DO NOT RENAME THE FIRST PART, aka the container environment variable, # unless you sure what you are doing environment: &env - - DEBUG=${DEBUG:-false} - - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python - - PORTAINER_URL=${PORTAINER_URL} - - PORTAINER_KEY=${PORTAINER_KEY} + - DEBUG_CONTAINER=${DEBUG_CONTAINER:-false} + - GLEANERIO_CONFIG_PATH=${GLEANERIO_CONFIG_PATH:-scheduler/configs/test/} + - GLEANERIO_DAGSTER_CONFIG_PATH=${GLEANERIO_DAGSTER_CONFIG_PATH:-scheduler/logs/} + - GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=${GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT:-300} + - GLEANERIO_DOCKER_GLEANER_CONFIG=${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner} + - GLEANERIO_DOCKER_HEADLESS_NETWORK=${GLEANERIO_DOCKER_HEADLESS_NETWORK} + - GLEANERIO_DOCKER_NABU_CONFIG=${GLEANERIO_DOCKER_NABU_CONFIG:-nabu} + - GLEANERIO_DOCKER_URL=${GLEANERIO_DOCKER_URL} + - GLEANERIO_DOCKER_WORKSPACE_CONFIG=${GLEANERIO_DOCKER_WORKSPACE_CONFIG} + - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} - GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE} - - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} - - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX} + - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE} + - GLEANERIO_GRAPH_SUMMARIZE=${GLEANERIO_GRAPH_SUMMARIZE:-false} + - GLEANERIO_GRAPH_SUMMARY_ENDPOINT=${GLEANERIO_GRAPH_SUMMARY_ENDPOINT:-${GLEANERIO_GRAPH_URL}} + - GLEANERIO_GRAPH_SUMMARY_NAMESPACE=${GLEANERIO_GRAPH_SUMMARY_NAMESPACE} + - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL} + - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} + - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX:-scheduler/logs/} + - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} - GLEANERIO_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS} - - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT} - - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} - GLEANERIO_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET} - 
- GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} + - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT} - GLEANERIO_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY} - - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} - - GLEANERIO_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK} - - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL} - - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE} + - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} - GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml} - - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} - - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu} - - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner} - - ECRR_MINIO_BUCKET="ECRR" - - ECRR_GRAPH_NAMESPACE="ECRR" + - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} + - GLEANERIO_PORTAINER_APIKEY=${GLEANERIO_PORTAINER_APIKEY} + - GLEANERIO_DEFAULT_SCHEDULE=${GLEANERIO_DEFAULT_SCHEDULE:-@weekly} + - GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE=${GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE:-"America/Los_Angeles"} + - GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml} + - GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml} + - GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH} + - GLEANERIO_CSV_CONFIG_URL=${GLEANERIO_CSV_CONFIG_URL:-https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv} + - ECRR_MINIO_BUCKET=${ECRR_MINIO_BUCKET} + - ECRR_GRAPH_NAMESPACE=${ECRR_GRAPH_NAMESPACE} + - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + - SLACK_CHANNEL=${SLACK_CHANNEL:-"#twitterfeed"} + - SLACK_TOKEN=${SLACK_TOKEN} # caution for a portainer additional file deply # portainer issue, merging commands may need to create a combined customised on command: 'dagster-webserver -w workspace.yaml -h 
"0.0.0.0" -p 3000' depends_on: &deps - dagster-postgres - - dagster-code-project + - dagster-code-ingest - dagster-code-tasks - dagster-code-eco-ecrr @@ -82,10 +97,10 @@ services: - "0.0.0.0" - "-p" - "4000" - - "--python-file" - - "/usr/src/app/workflows/ecrr/repositories/repository.py" + - "-m" + - "workflows.ecrr.ecrr" - "-d" - - "/usr/src/app/workflows/ecrr/" + - "/usr/src/app/" volumes: *vol depends_on: diff --git a/dagster/implnets/deployment/compose_local_iow_override.yaml b/dagster/implnets/deployment/compose_local_iow_override.yaml index 2179a294..faae1627 100644 --- a/dagster/implnets/deployment/compose_local_iow_override.yaml +++ b/dagster/implnets/deployment/compose_local_iow_override.yaml @@ -12,7 +12,7 @@ services: volumes: &vol - ../configs/${PROJECT:-iow}/workspace.yaml:/usr/src/app/workspace.yaml - ../deployment/dagster.yaml:/usr/src/app/dagster.yaml - - ../generatedCode/implnet-${PROJECT:-iow}/output/:/usr/src/app/project/${PROJECT:-iow} +# - ../generatedCode/implnet-${PROJECT:-iow}/output/:/usr/src/app/project/${PROJECT:-iow} - ../workflows/:/usr/src/app/workflows # GLEANEERIO_ the environment variables for this stack, passed into containers # the variables passed into the containers varies due to inconsistent standards. 
@@ -20,27 +20,40 @@ services: # DO NOT RENAME THE FIRST PART, aka the container environment variable, # unless you sure what you are doing environment: &env - - DEBUG=${DEBUG:-false} - - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python - - PORTAINER_URL=${PORTAINER_URL} - - PORTAINER_KEY=${PORTAINER_KEY} + - DEBUG_CONTAINER=${DEBUG_CONTAINER:-false} + - GLEANERIO_CONFIG_PATH=${GLEANERIO_CONFIG_PATH:-scheduler/configs/test/} + - GLEANERIO_DAGSTER_CONFIG_PATH=${GLEANERIO_DAGSTER_CONFIG_PATH:-scheduler/logs/} + - GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=${GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT:-300} + - GLEANERIO_DOCKER_GLEANER_CONFIG=${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner} + - GLEANERIO_DOCKER_HEADLESS_NETWORK=${GLEANERIO_DOCKER_HEADLESS_NETWORK} + - GLEANERIO_DOCKER_NABU_CONFIG=${GLEANERIO_DOCKER_NABU_CONFIG:-nabu} + - GLEANERIO_DOCKER_URL=${GLEANERIO_DOCKER_URL} + - GLEANERIO_DOCKER_WORKSPACE_CONFIG=${GLEANERIO_DOCKER_WORKSPACE_CONFIG} + - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} - GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE} - - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} + - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE} + - GLEANERIO_GRAPH_SUMMARIZE=${GLEANERIO_GRAPH_SUMMARIZE:-false} + - GLEANERIO_GRAPH_SUMMARY_ENDPOINT=${GLEANERIO_GRAPH_SUMMARY_ENDPOINT:-${GLEANERIO_GRAPH_URL}} + - GLEANERIO_GRAPH_SUMMARY_NAMESPACE=${GLEANERIO_GRAPH_SUMMARY_NAMESPACE} + - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL} + - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} + - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX:-scheduler/logs/} - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX} + - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} - GLEANERIO_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS} - - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT} - - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} - GLEANERIO_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET} - - 
GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} + - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT} - GLEANERIO_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY} - - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} - - GLEANERIO_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK} - - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL} - - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE} + - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} - GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml} - - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} - - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu} - - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner} + - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} + - GLEANERIO_PORTAINER_APIKEY=${GLEANERIO_PORTAINER_APIKEY} + - GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml} + - GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml} + - GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH} + - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + - SLACK_CHANNEL=${SLACK_CHANNEL:-"#twitterfeed"} + - SLACK_TOKEN=${SLACK_TOKEN} # - ECRR_MINIO_BUCKET="ECRR" # - ECRR_GRAPH_NAMESPACE="ECRR" # caution for a portainer additional file deply @@ -48,7 +61,7 @@ services: command: 'dagster-webserver -w workspace.yaml -h "0.0.0.0" -p 3000' depends_on: &deps - dagster-postgres - - dagster-code-project + - dagster-code-ingest - dagster-code-tasks # - dagster-code-iow-ecrr diff --git a/dagster/implnets/deployment/compose_project.yaml b/dagster/implnets/deployment/compose_project.yaml index 51e1d30b..81be7d32 100644 --- a/dagster/implnets/deployment/compose_project.yaml +++ b/dagster/implnets/deployment/compose_project.yaml @@ -25,18 +25,22 @@ networks: volumes: dagster-postgres: driver: local + dagster-storage: + driver: local # external so it could be shared 
accross docker swarms configs: - gleaner: - name: ${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner} - external: true - nabu: - name: ${GLEANERIO_NABU_DOCKER_CONFIG:-nabu} - external: true +# gleaner: +# name: ${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner} +# external: true +# nabu: +# name: ${GLEANERIO_DOCKER_NABU_CONFIG:-nabu} +# external: true workspace: - name: ${GLEANERIO_WORKSPACE_DOCKER_CONFIG:-workspace} + name: ${GLEANERIO_DOCKER_WORKSPACE_CONFIG:-workspace} + external: true + dagster: + name: ${GLEANERIO_DOCKER_DAGSTER_CONFIG:-dagster} external: true - secrets: MINIO_ROOT_ACCESS_KEY: external: true @@ -60,16 +64,22 @@ services: # gid: "103" mode: 0444 - - source: gleaner - target: /scheduler/gleanerconfig.yaml - mode: - 044 - - source: nabu - target: /scheduler/nabuconfig.yaml + - source: dagster + target: /usr/src/app/dagster.yaml # uid: "103" # gid: "103" mode: - 044 + 0444 +# - source: gleaner +# target: /scheduler/gleanerconfig.yaml +# mode: +# 044 +# - source: nabu +# target: /scheduler/nabuconfig.yaml +# # uid: "103" +# # gid: "103" +# mode: +# 044 secrets: - MINIO_ROOT_ACCESS_KEY - MINIO_ROOT_SECRET_KEY @@ -79,31 +89,43 @@ services: # DO NOT RENAME THE FIRST PART, aka the container environment variable, # unless you sure what you are doing environment: &env - - DEBUG=${DEBUG:-false} - - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python - - PORTAINER_URL=${PORTAINER_URL} - - PORTAINER_KEY=${PORTAINER_KEY} + # sort these in BBedit to make finding them easier + - DEBUG_CONTAINER=${DEBUG_CONTAINER:-false} + - GLEANERIO_CONFIG_PATH=${GLEANERIO_CONFIG_PATH:-scheduler/configs/test/} + - GLEANERIO_DAGSTER_CONFIG_PATH=${GLEANERIO_DAGSTER_CONFIG_PATH:-scheduler/logs/} + - GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=${GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT:-300} + - GLEANERIO_DOCKER_GLEANER_CONFIG=${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner} + - GLEANERIO_DOCKER_HEADLESS_NETWORK=${GLEANERIO_DOCKER_HEADLESS_NETWORK} + - 
GLEANERIO_DOCKER_NABU_CONFIG=${GLEANERIO_DOCKER_NABU_CONFIG:-nabu} + - GLEANERIO_DOCKER_URL=${GLEANERIO_DOCKER_URL} + - GLEANERIO_DOCKER_WORKSPACE_CONFIG=${GLEANERIO_DOCKER_WORKSPACE_CONFIG} + - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} - GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE} - - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} - - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX} + - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE} + - GLEANERIO_GRAPH_SUMMARIZE=${GLEANERIO_GRAPH_SUMMARIZE:-false} + - GLEANERIO_GRAPH_SUMMARY_ENDPOINT=${GLEANERIO_GRAPH_SUMMARY_ENDPOINT:-${GLEANERIO_GRAPH_URL}} + - GLEANERIO_GRAPH_SUMMARY_NAMESPACE=${GLEANERIO_GRAPH_SUMMARY_NAMESPACE} + - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL} + - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} + - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX:-scheduler/logs/} + - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} - GLEANERIO_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS} - - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT} - - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} - GLEANERIO_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET} - - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} + - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT} - GLEANERIO_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY} - - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} - - GLEANERIO_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK} - - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL} - - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE} + - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} - GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml} - - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} - - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu} - - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner} - - 
GLEANERIO_SUMMARY_GRAPH_ENDPOINT=${GLEANERIO_SUMMARY_GRAPH_ENDPOINT} - - GLEANERIO_SUMMARY_GRAPH_NAMESPACE=${GLEANERIO_SUMMARY_GRAPH_NAMESPACE} -# - GLEANER_MINIO_KEY=/run/secrets/MINIO_ROOT_ACCESS_KEY -# - GLEANER_MINIO_SECRET=/run/secrets/MINIO_ROOT_SECRET_KEY + - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} + - GLEANERIO_PORTAINER_APIKEY=${GLEANERIO_PORTAINER_APIKEY} + - GLEANERIO_DEFAULT_SCHEDULE=${GLEANERIO_DEFAULT_SCHEDULE:-@weekly} + - GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE=${GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE:-America/Los_Angeles} + - GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml} + - GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml} + - GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH} + - GLEANERIO_CSV_CONFIG_URL=${GLEANERIO_CSV_CONFIG_URL:-https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv} + - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + - SLACK_CHANNEL=${SLACK_CHANNEL:-"#twitterfeed"} + - SLACK_TOKEN=${SLACK_TOKEN} ports: - 3000:3000 @@ -116,7 +138,7 @@ services: - "traefik.enable=true" - "traefik.http.routers.sched-${PROJECT:-eco}.entrypoints=http" - "traefik.http.routers.sched-${PROJECT:-eco}.priority=13" - - "traefik.http.routers.sched-${PROJECT:-eco}.rule=Host(`sched.${HOST? HOST is required}`)" + - "traefik.http.routers.sched-${PROJECT:-eco}.rule=Host(`${SCHED_HOSTNAME:-sched}.${HOST? 
HOST is required}`)" - "traefik.http.middlewares.sched-https-redirect.redirectscheme.scheme=https" - "traefik.http.routers.sched-${PROJECT:-eco}.middlewares=sched-https-redirect" - "traefik.http.routers.sched-${PROJECT:-eco}-secure.entrypoints=https" @@ -128,7 +150,7 @@ services: - "traefik.http.services.sched-${PROJECT:-eco}.loadbalancer.server.port=3000" - "traefik.docker.network=traefik_proxy" - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accesscontrolallowmethods=GET,OPTIONS,POST" - - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accesscontrolalloworigin=*" + - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accessControlAllowOriginList=*" - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.accesscontrolmaxage=100" - "traefik.http.middlewares.sched-${PROJECT:-eco}.headers.addvaryheader=true" dagster-daemon: @@ -202,11 +224,42 @@ services: - dagster_host - headless # in code, use names defined in network above + dagster-code-ingest: + +# build: +# #context: . +# context: .. +# dockerfile: build/Dockerfile_code +# args: +# implnet: ${PROJECT:-eco} + # you should be able to change the source locally, without a full rebuild. + #image: dagster-${PROJECT:-eco}:latest + image: docker.io/nsfearthcube/dagster-gleanerio-workflows:${CONTAINER_CODE_TAG:-latest} + environment: *env + command: + - "dagster" + - "api" + - "grpc" + - "-h" + - "0.0.0.0" + - "-p" + - "4000" + - "-m" + - "workflows.ingest.ingest" + - "-d" + - "/usr/src/app/" + + volumes: &codevol + - dagster-storage:/usr/src/app/storage + depends_on: + - dagster-postgres + networks: + - dagster_host dagster-code-tasks: # you should be able to change the source locally, without a full rebuild. 
#image: dagster-${PROJECT:-eco}:latest - image: docker.io/nsfearthcube/dagster-gleanerio-${PROJECT:-eco}:${CONTAINER_CODE_TAG:-latest} + image: docker.io/nsfearthcube/dagster-gleanerio-workflows:${CONTAINER_CODE_TAG:-latest} environment: *env command: @@ -222,7 +275,7 @@ services: - "-d" - "/usr/src/app/" - # volumes: *vol + volumes: *codevol depends_on: - dagster-postgres networks: @@ -249,4 +302,4 @@ services: depends_on: - dagster-postgres networks: - - dagster_host \ No newline at end of file + - dagster_host diff --git a/dagster/implnets/deployment/compose_project_eco_override.yaml b/dagster/implnets/deployment/compose_project_eco_override.yaml index 13ee8d52..98f93502 100644 --- a/dagster/implnets/deployment/compose_project_eco_override.yaml +++ b/dagster/implnets/deployment/compose_project_eco_override.yaml @@ -15,29 +15,43 @@ services: # DO NOT RENAME THE FIRST PART, aka the container environment variable, # unless you sure what you are doing environment: &env - - DEBUG=${DEBUG:-false} - - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python - - PORTAINER_URL=${PORTAINER_URL} - - PORTAINER_KEY=${PORTAINER_KEY} + - DEBUG_CONTAINER=${DEBUG_CONTAINER:-false} + - GLEANERIO_CONFIG_PATH=${GLEANERIO_CONFIG_PATH:-scheduler/configs/test/} + - GLEANERIO_DAGSTER_CONFIG_PATH=${GLEANERIO_DAGSTER_CONFIG_PATH:-scheduler/logs/} + - GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=${GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT:-300} + - GLEANERIO_DOCKER_GLEANER_CONFIG=${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner} + - GLEANERIO_DOCKER_HEADLESS_NETWORK=${GLEANERIO_DOCKER_HEADLESS_NETWORK} + - GLEANERIO_DOCKER_NABU_CONFIG=${GLEANERIO_DOCKER_NABU_CONFIG:-nabu} + - GLEANERIO_DOCKER_URL=${GLEANERIO_DOCKER_URL} + - GLEANERIO_DOCKER_WORKSPACE_CONFIG=${GLEANERIO_DOCKER_WORKSPACE_CONFIG} + - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} - GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE} - - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} - - 
GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX} + - GLEANERIO_GRAPH_SUMMARIZE=${GLEANERIO_GRAPH_SUMMARIZE:-false} + - GLEANERIO_GRAPH_SUMMARY_ENDPOINT=${GLEANERIO_GRAPH_SUMMARY_ENDPOINT:-${GLEANERIO_GRAPH_URL}} + - GLEANERIO_GRAPH_SUMMARY_NAMESPACE=${GLEANERIO_GRAPH_SUMMARY_NAMESPACE} + - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL} + - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} + - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX:-scheduler/logs/} + - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} - GLEANERIO_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS} - - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT} - - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} - GLEANERIO_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET} - - GLEANERIO_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} + - GLEANERIO_MINIO_PORT=${GLEANERIO_MINIO_PORT} - GLEANERIO_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY} - - GLEANERIO_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} - - GLEANERIO_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK} - - GLEANERIO_GRAPH_URL=${GLEANERIO_GRAPH_URL} - - GLEANERIO_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE} + - GLEANERIO_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} - GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml} - - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} - - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu} - - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner} - - ECRR_MINIO_BUCKET="ECRR" - - ECRR_GRAPH_NAMESPACE="ECRR" + - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} + - GLEANERIO_PORTAINER_APIKEY=${GLEANERIO_PORTAINER_APIKEY} + - GLEANERIO_DEFAULT_SCHEDULE=${GLEANERIO_DEFAULT_SCHEDULE:-@weekly} + - GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE=${GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE:-America/Los_Angeles} + - GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml} + - 
GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml} + - GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH} + - GLEANERIO_CSV_CONFIG_URL=${GLEANERIO_CSV_CONFIG_URL:-https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv} + - ECRR_MINIO_BUCKET=${ECRR_MINIO_BUCKET} + - ECRR_GRAPH_NAMESPACE=${ECRR_GRAPH_NAMESPACE} + - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + - SLACK_CHANNEL=${SLACK_CHANNEL:-"#twitterfeed"} + - SLACK_TOKEN=${SLACK_TOKEN} # command: @@ -49,7 +63,7 @@ services: # - "-p" # - "3000" depends_on: &deps - - dagster-code-project + - dagster-code-ingest - dagster-code-tasks dagster-daemon: @@ -62,7 +76,7 @@ services: # - "workspace.yaml" depends_on: - dagster-postgres - - dagster-code-project + - dagster-code-ingest - dagster-code-tasks - dagster-code-eco-ecrr networks: @@ -72,7 +86,7 @@ services: dagster-code-eco-ecrr: # you should be able to change the source locally, without a full rebuild. 
#image: dagster-${PROJECT:-eco}:latest - image: docker.io/nsfearthcube/dagster-gleanerio-${PROJECT:-eco}:${CONTAINER_CODE_TAG:-latest} + image: docker.io/nsfearthcube/dagster-gleanerio-workflows:${CONTAINER_CODE_TAG:-latest} environment: *env command: @@ -83,12 +97,13 @@ services: - "0.0.0.0" - "-p" - "4000" - - "--python-file" - - "/usr/src/app/workflows/ecrr/repositories/repository.py" + - "-m" + - "workflows.ecrr.ecrr" - "-d" - - "/usr/src/app/workflows/ecrr/" + - "/usr/src/app/" - #volumes: *vol + volumes: + - dagster-storage:/usr/src/app/storage depends_on: - dagster-postgres networks: diff --git a/dagster/implnets/deployment/dagster.yaml b/dagster/implnets/deployment/dagster.yaml index 9ff78eed..0c726065 100644 --- a/dagster/implnets/deployment/dagster.yaml +++ b/dagster/implnets/deployment/dagster.yaml @@ -32,6 +32,7 @@ run_coordinator: module: dagster.core.run_coordinator class: QueuedRunCoordinator config: - max_concurrent_runs: 4 +# max_concurrent_runs: 4 + max_concurrent_runs: 2 telemetry: enabled: false diff --git a/dagster/implnets/deployment/dagster_localrun.sh b/dagster/implnets/deployment/dagster_localrun.sh index 0dc626b1..d51b585b 100755 --- a/dagster/implnets/deployment/dagster_localrun.sh +++ b/dagster/implnets/deployment/dagster_localrun.sh @@ -18,7 +18,9 @@ do ? ) helpFunction ;; # Print helpFunction in case parameter is non-existent esac done - +RED='\033[0;31m' +Yellow='\033[0;33m' +NC='\033[0m' if [ ! $envfile ] then @@ -32,7 +34,7 @@ if [ -f $envfile ] export $(sed '/^[ \t]*#/d' $envfile | sed '/^$/d' | xargs) else - echo "missing environment file. pass flag, or copy and edit file" + echo -e "${RED} missing environment file. 
pass flag, or copy and edit file${NC}" echo "cp envFile.env .env" echo "OR" echo "cp {yourenv}.env .env" @@ -43,24 +45,24 @@ fi ## need to docker (network|volume) ls | grep (traefik_proxy|traefik_proxy) before these calll ## or an error will be thrown #echo "This message is OK **Error response from daemon: network with name traefik_proxy already exists.** " -if [ "$(docker network ls | grep ${GLEANERIO_HEADLESS_NETWORK})" ] ; then - echo ${GLEANERIO_HEADLESS_NETWORK} netowrk exists; +if [ "$(docker network ls | grep ${GLEANERIO_DOCKER_HEADLESS_NETWORK})" ] ; then + echo ${GLEANERIO_DOCKER_HEADLESS_NETWORK} netowrk exists; else echo creating network if [ "$(docker info | grep Swarm | sed 's/Swarm: //g' | tr -d ' ')" == "inactive" ]; then echo Not Swarm - if `docker network create -d bridge --attachable ${GLEANERIO_HEADLESS_NETWORK}`; then - echo 'Created network ${GLEANERIO_HEADLESS_NETWORK}' + if `docker network create -d bridge --attachable ${GLEANERIO_DOCKER_HEADLESS_NETWORK}`; then + echo 'Created network ${GLEANERIO_DOCKER_HEADLESS_NETWORK}' else - echo "ERROR: *** Failed to create local network. " + echo -e "${RED}ERROR: *** Failed to create local network. ${NC}" exit 1 fi else echo Is Swarm - if `docker network create -d overlay --attachable ${GLEANERIO_HEADLESS_NETWORK}`; then - echo 'Created network ${GLEANERIO_HEADLESS_NETWORK}' + if `docker network create -d overlay --attachable ${GLEANERIO_DOCKER_HEADLESS_NETWORK}`; then + echo 'Created network ${GLEANERIO_DOCKER_HEADLESS_NETWORK}' else - echo "ERROR: *** Failed to create swarm network. " + echo -e "${RED}ERROR: *** Failed to create swarm network. ${NC}" exit 1 fi fi @@ -70,9 +72,10 @@ fi #echo NOTE: Verify that the traefik_proxy network SCOPE is swarm -docker volume create ${GLEANERIO_CONFIG_VOLUME:-dagster_gleaner_configs} - -echo DO NOT FORGET TO USE pygen/makefile REGNERATE THE CODE. 
+RED='\033[0;31m' +Yellow='\033[0;33m' +NC='\033[0m' +echo -e ${Yellow}DO NOT FORGET TO USE pygen/makefile REGNERATE THE CODE.${NC} echo run as detached: $detached @@ -89,5 +92,6 @@ if [ "$detached" = true ] else docker compose -p dagster --env-file $envfile -f compose_local.yaml $override_file up fi +echo -e ${Yellow}DO NOT FORGET TO USE pygen/makefile REGNERATE THE CODE.${NC} +echo -e ${Yellow}If gleaner@project_grpc shows in UI as not working, most likely, REGNERATE THE CODE.${NC} -echo DO NOT FORGET TO USE pygen/makefile REGNERATE THE CODE. diff --git a/dagster/implnets/deployment/dagster_setup_docker.sh b/dagster/implnets/deployment/dagster_setup_docker.sh index 8fd6af07..7ba2f351 100755 --- a/dagster/implnets/deployment/dagster_setup_docker.sh +++ b/dagster/implnets/deployment/dagster_setup_docker.sh @@ -41,25 +41,25 @@ fi ## need to docker (network|volume) ls | grep (traefik_proxy|traefik_proxy) before these calll ## or an error will be thrown #echo "This message is OK **Error response from daemon: network with name traefik_proxy already exists.** " -if [ "$(docker network ls | grep -${GLEANER_HEADLESS_NETWORK})" ] ; then - echo ${GLEANER_HEADLESS_NETWORK} netowrk exists; +if [ "$(docker network ls | grep -${GLEANERIO_DOCKER_HEADLESS_NETWORK})" ] ; then + echo ${GLEANERIO_DOCKER_HEADLESS_NETWORK} netowrk exists; else echo creating network if [ "$(docker info | grep Swarm | sed 's/Swarm: //g')" == "inactive" ]; then echo Not Swarm - if `docker network create -d bridge --attachable ${GLEANER_HEADLESS_NETWORK}`; then - echo 'Created network ${GLEANER_HEADLESS_NETWORK}' + if `docker network create -d bridge --attachable ${GLEANERIO_DOCKER_HEADLESS_NETWORK}`; then + echo 'Created network ${GLEANERIO_DOCKER_HEADLESS_NETWORK}' else echo "ERROR: *** Failed to create local network. 
" - exit 1 + # exit 1 fi else echo Is Swarm - if `docker network create -d overlay --attachable ${GLEANER_HEADLESS_NETWORK}`; then - echo 'Created network ${GLEANER_HEADLESS_NETWORK}' + if `docker network create -d overlay --attachable ${GLEANERIO_DOCKER_HEADLESS_NETWORK}`; then + echo 'Created network ${GLEANERIO_DOCKER_HEADLESS_NETWORK}' else echo "ERROR: *** Failed to create swarm network. " - exit 1 + #exit 1 fi fi @@ -67,9 +67,10 @@ fi #echo NOTE: Verify that the traefik_proxy network SCOPE is swarm -docker volume create ${GLEANER_CONFIG_VOLUME:-dagster_gleaner_configs} +echo added network ${GLEANERIO_DOCKER_HEADLESS_NETWORK} -echo added network ${GLEANER_HEADLESS_NETWORK} and volume ${GLEANER_CONFIG_VOLUME} +docker volume create dagster-postgres +docker volume create dagster-storage if [ "$(docker config ls | grep -${GLEANERIO_GLEANER_CONFIG_PATH})" ] ; then echo ${GLEANERIO_GLEANER_CONFIG_PATH} config exists; @@ -79,8 +80,9 @@ else if `docker config create gleaner-${PROJECT} ../configs/${PROJECT}/gleanerconfig.yaml`; then echo 'Created gleaner config gleaner-${PROJECT} ${GLEANERIO_GLEANER_CONFIG_PATH}' else - echo "ERROR: *** Failed to create config. " - exit 1 + echo "ERROR: *** Failed to create docker/potainer config. gleaner-${PROJECT} ${GLEANERIO_GLEANER_CONFIG_PATH}" + echo "see if config exists " + # exit 1 fi fi @@ -92,8 +94,9 @@ else if `docker config create nabu-${PROJECT} ../configs/${PROJECT}/nabuconfig.yaml`; then echo 'Created gleaner config nabu-${PROJECT} ${GLEANERIO_NABU_CONFIG_PATH}' else - echo "ERROR: *** Failed to create config. " - exit 1 + echo "ERROR: *** Failed to create create docker/potainer config. 
nabu-${PROJECT} ${GLEANERIO_NABU_CONFIG_PATH} " + echo "see if config exists " + # exit 1 fi fi @@ -105,7 +108,22 @@ else if `docker config create workspace-${PROJECT} ../configs/${PROJECT}/workspace.yaml`; then echo 'Created gleaner config workspace-${PROJECT} ${GLEANERIO_WORKSPACE_CONFIG_PATH}' else - echo "ERROR: *** Failed to create config. " - exit 1 + echo "ERROR: *** Failed to create create docker/potainer config. workspace-${PROJECT} ${GLEANERIO_WORKSPACE_CONFIG_PATH}" + echo "see if config exists " + # exit 1 + fi +fi + +if [ "$(docker config ls | grep -${GLEANERIO_WORKSPACE_CONFIG_PATH})" ] ; then + echo ${GLEANERIO_WORKSPACE_CONFIG_PATH} config exists; +else + echo creating config + + if `docker config create workspace-${PROJECT} ../configs/${PROJECT}/workspace.yaml`; then + echo 'Created gleaner config workspace-${PROJECT} ${GLEANERIO_WORKSPACE_CONFIG_PATH}' + else + echo "ERROR: *** Failed to create create docker/potainer config. workspace-${PROJECT} ${GLEANERIO_WORKSPACE_CONFIG_PATH}" + echo "see if config exists " + # exit 1 fi fi diff --git a/dagster/implnets/deployment/envFile.env b/dagster/implnets/deployment/envFile.env index 4fa2699d..994418e5 100644 --- a/dagster/implnets/deployment/envFile.env +++ b/dagster/implnets/deployment/envFile.env @@ -1,42 +1,72 @@ +DAGSTER_HOME=dagster/dagster_home +## PROJECT -- default 'eco' this is a 'TRAEFIK router name' use to run multiple copies of scheduler on a server +# originally used to generate code for a specific project +#PROJECT=test + +#PROJECT=eco +#PROJECT=iow +#PROJECT=oih ###### # Nabu and Gleaner configs need to be in docker configs ## docker config name GLEANER_GLEANER_DOCKER_CONFIG ## docker config name GLEANER_NABU_DOCKER_CONFIG # suggested DOCKER_CONFIG NAMING PATTERN (nabu||gleaner)-{PROJECT} ######## -GLEANERIO_GLEANER_DOCKER_CONFIG=gleaner-eco -GLEANERIO_NABU_DOCKER_CONFIG=nabu-eco +GLEANERIO_DOCKER_GLEANER_CONFIG=gleaner-eco +GLEANERIO_DOCKER_NABU_CONFIG=nabu-eco # ### # workspace for 
dagster #### GLEANERIO_WORKSPACE_CONFIG_PATH=/usr/src/app/workspace.yaml -GLEANERIO_WORKSPACE_DOCKER_CONFIG=workspace-eco +GLEANERIO_DOCKER_WORKSPACE_CONFIG=workspace-eco + +GLEANERIO_DOCKER_DAGSTER_CONFIG=dagster + + +DEBUG_CONTAINER=false + +#### HOST +# host base name for traefik. fixed to localhost:3000 when using compose_local. +HOST=localhost +# Applies only to compose_project.yaml runs + +# modify SCHED_HOSTNAME if you want to run more than one instance +# aka two different project harvests for now. +SCHED_HOSTNAME=sched + +GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=300 +# debugging set to 10 - 30 seconds +# DEFAULT SCHEDULE +# as defined by https://docs.dagster.io/concepts/partitions-schedules-sensors/schedules#basic-schedules +# "@hourly", "@daily", "@weekly", and "@monthly" +#GLEANERIO_DEFAULT_SCHEDULE=@weekly +#GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE="America/Los_Angeles" +# the above are used as hard coded os.getenv(), so when changed, service needs to be restarted. -DEBUG=False -PROJECT=eco + +# tags for docker compose CONTAINER_CODE_TAG=latest CONTAINER_DAGSTER_TAG=latest -#PROJECT=iow -#PROJECT=oih -HOST=localhost + PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python -# port is required: https://portainer.{HOST}:443/api/endpoints/2/docker/ -PORTAINER_URL= -PORTAINER_KEY= +# port is required: https://portainer.{HOST}:443/api/endpoints/9/docker/ +# 9 is dataloader, 2 is aws-dev +GLEANERIO_DOCKER_URL=https://portainer.{HOST}:443/api/endpoints/9/docker/ +GLEANERIO_PORTAINER_APIKEY= # if running dagster-dev, then this needs to be set , # defaults to "/scheduler/gleanerconfig.yaml" which is path to config mounted in containers # when debugging generated code "../../../configs/eco/gleanerconfig.yaml" # when debugging code in workflows "../../configs/eco/gleanerconfig.yaml" -# DAGSTER_GLEANER_CONFIG_PATH=../../../configs/eco/gleanerconfig.yaml +GLEANERIO_DAGSTER_CONFIG_PATH=../../../configs/eco/gleanerconfig.yaml # Network -GLEANERIO_HEADLESS_NETWORK=headless_gleanerio 
+GLEANERIO_DOCKER_HEADLESS_NETWORK=headless_gleanerio ### GLEANER/NABU Dockers -GLEANERIO_GLEANER_IMAGE=nsfearthcube/gleaner:latest -GLEANERIO_NABU_IMAGE=nsfearthcube/nabu:latest +GLEANERIO_GLEANER_IMAGE=nsfearthcube/gleaner:dev_ec +GLEANERIO_NABU_IMAGE=nsfearthcube/nabu:dev_eco ## # path where configs are deployed/mounted @@ -56,10 +86,27 @@ GLEANERIO_MINIO_SECRET_KEY= GLEANERIO_HEADLESS_ENDPOINT=http://headless:9222 # just the base address, no namespace https://graph.geocodes-aws-dev.earthcube.org/blazegraph -GLEANERIO_GRAPH_URL= -GLEANERIO_GRAPH_NAMESPACE= +GLEANERIO_GRAPH_URL=https://graph.geocodes-aws.earthcube.org/blazegraph +GLEANERIO_GRAPH_NAMESPACE=earthcube + +# optional: GLEANERIO_GRAPH_SUMMARY_ENDPOINT defaults to GLEANERIO_GRAPH_URL +#GLEANERIO_GRAPH_SUMMARY_ENDPOINT=https://graph.geocodes-aws-dev.earthcube.org/blazegraph +GLEANERIO_GRAPH_SUMMARY_NAMESPACE=earthcube_summary +GLEANERIO_GRAPH_SUMMARIZE=True + +# where are the gleaner and tenant configurations +GLEANERIO_CONFIG_PATH=scheduler/configs/ +GLEANERIO_TENANT_FILENAME=tenant.yaml +GLEANERIO_SOURCES_FILENAME=gleanerconfig.yaml + +# ECO Custom variables for ecrr +ECRR_GRAPH_NAMESPACE=ecrr +ECRR_MINIO_BUCKET=ecrr + +# only a public slack channel works. 
DV has no permissions to create a new channel +SLACK_CHANNEL="#production_discussion" +#SLACK_CHANNEL="#twitterfeed" +SLACK_TOKEN= -# example: https://graph.geocodes.ncsa.illinois.edu/blazegraph/namespace/yyearthcube2/sparql -#graph endpoint will be GLEANERIO_GRAPH_URL -GLEANERIO_SUMMARY_GRAPH_NAMESPACE= +GLEANERIO_CSV_CONFIG_URL=https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv diff --git a/dagster/implnets/generatedCode/NOTE_DAGSTER_CLI.md b/dagster/implnets/generatedCode/NOTE_DAGSTER_CLI.md new file mode 100644 index 00000000..0a897237 --- /dev/null +++ b/dagster/implnets/generatedCode/NOTE_DAGSTER_CLI.md @@ -0,0 +1,18 @@ + + +# RUNNING LOCALLY +* You need to point at a docker STACK, or portainer endpoint... A local workstation docker is usually not a STACK. +* set the ENV variables; I use the env file plugin in pycharm +* +`cd dagster/implnets/generatedCode/implnet-eco/output +python -m dagster dev ` + +## To run a job: +`cd dagster/implnets/generatedCode/implnet-eco/output +python -m dagster job execute -f jobs/implnet_jobs_ecrr_examples.py -j implnet_jobs_ecrr_examples` + +## also can just be dagster +cd dagster/implnets/generatedCode/implnet-eco/output +(do some magic env: eg `export $(sed '/^[ \t]*#/d' $envfile | sed '/^$/d' | xargs)` ) +* dagster job list -w workspace.yaml +* dagster job execute -f jobs/implnet_jobs_ecrr_examples.py -j implnet_jobs_ecrr_examples diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_amgeo.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_amgeo.py deleted file mode 100644 index b850c609..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_amgeo.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_amgeo import harvest_amgeo - -@job -def implnet_job_amgeo(): - harvest_amgeo() \ No newline at end of 
file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_aquadocs.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_aquadocs.py deleted file mode 100644 index 8d384135..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_aquadocs.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_aquadocs import harvest_aquadocs - -@job -def implnet_job_aquadocs(): - harvest_aquadocs() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_bcodmo.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_bcodmo.py deleted file mode 100644 index d5db3109..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_bcodmo.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_bcodmo import harvest_bcodmo - -@job -def implnet_job_bcodmo(): - harvest_bcodmo() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_cchdo.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_cchdo.py deleted file mode 100644 index d8380c4c..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_cchdo.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cchdo import harvest_cchdo - -@job -def implnet_job_cchdo(): - harvest_cchdo() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_datadiscoverystudio.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_datadiscoverystudio.py deleted file mode 100644 index ec41e220..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_datadiscoverystudio.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_datadiscoverystudio import harvest_datadiscoverystudio - -@job -def 
implnet_job_datadiscoverystudio(): - harvest_datadiscoverystudio() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_designsafe.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_designsafe.py deleted file mode 100644 index 122ba25c..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_designsafe.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_designsafe import harvest_designsafe - -@job -def implnet_job_designsafe(): - harvest_designsafe() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_earthchem.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_earthchem.py deleted file mode 100644 index 31fdf4e1..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_earthchem.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_earthchem import harvest_earthchem - -@job -def implnet_job_earthchem(): - harvest_earthchem() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ecrr_examples.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ecrr_examples.py deleted file mode 100644 index cb51bb8d..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ecrr_examples.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ecrr_examples import harvest_ecrr_examples - -@job -def implnet_job_ecrr_examples(): - harvest_ecrr_examples() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_edi.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_edi.py deleted file mode 100644 index 1f1a7229..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_edi.py +++ /dev/null @@ -1,7 
+0,0 @@ -from dagster import job - -from ops.implnet_ops_edi import harvest_edi - -@job -def implnet_job_edi(): - harvest_edi() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_geocodes_demo_datasets.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_geocodes_demo_datasets.py deleted file mode 100644 index 144da333..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_geocodes_demo_datasets.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_geocodes_demo_datasets import harvest_geocodes_demo_datasets - -@job -def implnet_job_geocodes_demo_datasets(): - harvest_geocodes_demo_datasets() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_geocodes_examples.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_geocodes_examples.py deleted file mode 100644 index bf0435ac..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_geocodes_examples.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_geocodes_examples import harvest_geocodes_examples - -@job -def implnet_job_geocodes_examples(): - harvest_geocodes_examples() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_hydroshare.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_hydroshare.py deleted file mode 100644 index 515e329a..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_hydroshare.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_hydroshare import harvest_hydroshare - -@job -def implnet_job_hydroshare(): - harvest_hydroshare() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_iedadata.py 
b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_iedadata.py deleted file mode 100644 index 44478429..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_iedadata.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_iedadata import harvest_iedadata - -@job -def implnet_job_iedadata(): - harvest_iedadata() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_iris.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_iris.py deleted file mode 100644 index 2cf195c3..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_iris.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_iris import harvest_iris - -@job -def implnet_job_iris(): - harvest_iris() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_linkedearth.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_linkedearth.py deleted file mode 100644 index 57b17ed1..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_linkedearth.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_linkedearth import harvest_linkedearth - -@job -def implnet_job_linkedearth(): - harvest_linkedearth() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_lipdverse.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_lipdverse.py deleted file mode 100644 index 57daa163..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_lipdverse.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_lipdverse import harvest_lipdverse - -@job -def implnet_job_lipdverse(): - harvest_lipdverse() \ No newline at end of file diff --git 
a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_magic.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_magic.py deleted file mode 100644 index 7a8140d6..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_magic.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_magic import harvest_magic - -@job -def implnet_job_magic(): - harvest_magic() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_neon.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_neon.py deleted file mode 100644 index 403d9f90..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_neon.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_neon import harvest_neon - -@job -def implnet_job_neon(): - harvest_neon() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_neotomadb.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_neotomadb.py deleted file mode 100644 index 4197a852..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_neotomadb.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_neotomadb import harvest_neotomadb - -@job -def implnet_job_neotomadb(): - harvest_neotomadb() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_opencoredata.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_opencoredata.py deleted file mode 100644 index a70c00a4..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_opencoredata.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_opencoredata import harvest_opencoredata - -@job -def implnet_job_opencoredata(): - harvest_opencoredata() \ No newline at 
end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_opentopography.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_opentopography.py deleted file mode 100644 index c59db610..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_opentopography.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_opentopography import harvest_opentopography - -@job -def implnet_job_opentopography(): - harvest_opentopography() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_r2r.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_r2r.py deleted file mode 100644 index ed4bbaae..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_r2r.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_r2r import harvest_r2r - -@job -def implnet_job_r2r(): - harvest_r2r() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_resource_registry.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_resource_registry.py deleted file mode 100644 index 582d1c9a..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_resource_registry.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_resource_registry import harvest_resource_registry - -@job -def implnet_job_resource_registry(): - harvest_resource_registry() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ssdbiodp.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ssdbiodp.py deleted file mode 100644 index 0c2a7d32..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ssdbiodp.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from 
ops.implnet_ops_ssdbiodp import harvest_ssdbiodp - -@job -def implnet_job_ssdbiodp(): - harvest_ssdbiodp() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ucar.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ucar.py deleted file mode 100644 index 86289be2..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_ucar.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ucar import harvest_ucar - -@job -def implnet_job_ucar(): - harvest_ucar() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_unavco.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_unavco.py deleted file mode 100644 index 5b339869..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_unavco.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_unavco import harvest_unavco - -@job -def implnet_job_unavco(): - harvest_unavco() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_unidata.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_unidata.py deleted file mode 100644 index 065cb671..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_unidata.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_unidata import harvest_unidata - -@job -def implnet_job_unidata(): - harvest_unidata() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_usapdc.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_usapdc.py deleted file mode 100644 index f4f13a75..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_usapdc.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from 
ops.implnet_ops_usapdc import harvest_usapdc - -@job -def implnet_job_usapdc(): - harvest_usapdc() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_wifire.py b/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_wifire.py deleted file mode 100644 index 10b38a00..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/jobs/implnet_jobs_wifire.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wifire import harvest_wifire - -@job -def implnet_job_wifire(): - harvest_wifire() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_amgeo.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_amgeo.py deleted file mode 100644 index a0618d00..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_amgeo.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from 
dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL 
= url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in 
container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - 
return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - 
context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", 
"SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - 
try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def amgeo_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def amgeo_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "amgeo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def amgeo_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "amgeo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def amgeo_nabuprov(context): - returned_value = gleanerio(context,("prov"), "amgeo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def amgeo_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "amgeo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def amgeo_naburelease(context): - returned_value = gleanerio(context,("release"), "amgeo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def amgeo_uploadrelease(context): - returned_value = post_to_graph("amgeo", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
amgeo_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="amgeo") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "amgeo" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def amgeo_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="amgeo") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "amgeo" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def amgeo_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="amgeo") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, 
GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "amgeo" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def amgeo_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="amgeo") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "amgeo" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def amgeo_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "amgeo" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - 
-@op(ins={"start": In(Nothing)}) -def amgeo_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "amgeo" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def amgeo_upload_summarize(context): - returned_value = post_to_graph("amgeo",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="amgeo"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="amgeo" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_amgeo(): - containers = amgeo_getImage() - harvest = amgeo_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = amgeo_missingreport_s3(start=harvest) - report_idstat = amgeo_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = amgeo_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="amgeo") - load_release = amgeo_naburelease(start=harvest) - load_uploadrelease = amgeo_uploadrelease(start=load_release) - - load_prune = amgeo_nabu_prune(start=load_uploadrelease) - load_prov = amgeo_nabuprov(start=load_prune) - load_org = amgeo_nabuorg(start=load_prov) - - summarize = amgeo_summarize(start=load_uploadrelease) - upload_summarize = amgeo_upload_summarize(start=summarize) - -# run after load - report_msgraph = amgeo_missingreport_graph(start=summarize) - report_graph = amgeo_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_aquadocs.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_aquadocs.py deleted file mode 100644 index f4de1374..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_aquadocs.py +++ /dev/null @@ -1,819 +0,0 
@@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = 
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - 
secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to 
{url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - 
if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - 
restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - 
Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # 
minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # 
https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def aquadocs_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def aquadocs_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "aquadocs") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def aquadocs_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "aquadocs") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def aquadocs_nabuprov(context): - returned_value = gleanerio(context,("prov"), "aquadocs") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def aquadocs_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "aquadocs") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def aquadocs_naburelease(context): - returned_value = gleanerio(context,("release"), "aquadocs") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def aquadocs_uploadrelease(context): - returned_value = post_to_graph("aquadocs", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def aquadocs_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="aquadocs") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "aquadocs" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def aquadocs_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="aquadocs") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "aquadocs" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def aquadocs_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="aquadocs") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "aquadocs" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def aquadocs_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="aquadocs") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "aquadocs" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def aquadocs_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "aquadocs" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report 
returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def aquadocs_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "aquadocs" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def aquadocs_upload_summarize(context): - returned_value = post_to_graph("aquadocs",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="aquadocs"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="aquadocs" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_aquadocs(): - containers = aquadocs_getImage() - harvest = aquadocs_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = aquadocs_missingreport_s3(start=harvest) - report_idstat = aquadocs_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = aquadocs_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="aquadocs") - load_release = aquadocs_naburelease(start=harvest) - load_uploadrelease = aquadocs_uploadrelease(start=load_release) - - load_prune = aquadocs_nabu_prune(start=load_uploadrelease) - load_prov = aquadocs_nabuprov(start=load_prune) - load_org = aquadocs_nabuorg(start=load_prov) - - summarize = aquadocs_summarize(start=load_uploadrelease) - upload_summarize = aquadocs_upload_summarize(start=summarize) - -# run after load - report_msgraph = aquadocs_missingreport_graph(start=summarize) - report_graph = aquadocs_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_bcodmo.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_bcodmo.py deleted file mode 100644 index 92e2d858..00000000 --- 
a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_bcodmo.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL 
= os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, 
- secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = 
headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, 
op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif 
(str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", 
"MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit 
tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def bcodmo_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def bcodmo_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "bcodmo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def bcodmo_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "bcodmo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def bcodmo_nabuprov(context): - returned_value = gleanerio(context,("prov"), "bcodmo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def bcodmo_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "bcodmo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def bcodmo_naburelease(context): - returned_value = gleanerio(context,("release"), "bcodmo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def bcodmo_uploadrelease(context): - returned_value = post_to_graph("bcodmo", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
bcodmo_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="bcodmo") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "bcodmo" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def bcodmo_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="bcodmo") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "bcodmo" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def bcodmo_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="bcodmo") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, 
GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "bcodmo" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def bcodmo_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="bcodmo") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "bcodmo" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def bcodmo_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "bcodmo" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - 
object_name="" - -@op(ins={"start": In(Nothing)}) -def bcodmo_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "bcodmo" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def bcodmo_upload_summarize(context): - returned_value = post_to_graph("bcodmo",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="bcodmo"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="bcodmo" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_bcodmo(): - containers = bcodmo_getImage() - harvest = bcodmo_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = bcodmo_missingreport_s3(start=harvest) - report_idstat = bcodmo_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = bcodmo_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="bcodmo") - load_release = bcodmo_naburelease(start=harvest) - load_uploadrelease = bcodmo_uploadrelease(start=load_release) - - load_prune = bcodmo_nabu_prune(start=load_uploadrelease) - load_prov = bcodmo_nabuprov(start=load_prune) - load_org = bcodmo_nabuorg(start=load_prov) - - summarize = bcodmo_summarize(start=load_uploadrelease) - upload_summarize = bcodmo_upload_summarize(start=summarize) - -# run after load - report_msgraph = bcodmo_missingreport_graph(start=summarize) - report_graph = bcodmo_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_cchdo.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_cchdo.py deleted file mode 100644 index b2a86804..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_cchdo.py +++ /dev/null @@ 
-1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - 
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - 
secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to 
{url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - 
if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - 
restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - 
Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # 
minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # 
https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cchdo_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cchdo_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cchdo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cchdo_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cchdo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cchdo_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cchdo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cchdo_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cchdo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cchdo_naburelease(context): - returned_value = gleanerio(context,("release"), "cchdo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cchdo_uploadrelease(context): - returned_value = post_to_graph("cchdo", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
cchdo_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="cchdo") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "cchdo" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cchdo_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="cchdo") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "cchdo" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cchdo_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="cchdo") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, 
GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "cchdo" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cchdo_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="cchdo") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "cchdo" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cchdo_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "cchdo" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - 
-@op(ins={"start": In(Nothing)}) -def cchdo_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "cchdo" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def cchdo_upload_summarize(context): - returned_value = post_to_graph("cchdo",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="cchdo"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cchdo" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cchdo(): - containers = cchdo_getImage() - harvest = cchdo_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cchdo_missingreport_s3(start=harvest) - report_idstat = cchdo_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cchdo_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cchdo") - load_release = cchdo_naburelease(start=harvest) - load_uploadrelease = cchdo_uploadrelease(start=load_release) - - load_prune = cchdo_nabu_prune(start=load_uploadrelease) - load_prov = cchdo_nabuprov(start=load_prune) - load_org = cchdo_nabuorg(start=load_prov) - - summarize = cchdo_summarize(start=load_uploadrelease) - upload_summarize = cchdo_upload_summarize(start=summarize) - -# run after load - report_msgraph = cchdo_missingreport_graph(start=summarize) - report_graph = cchdo_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_datadiscoverystudio.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_datadiscoverystudio.py deleted file mode 100644 index 28d0129f..00000000 --- 
a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_datadiscoverystudio.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# 
env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, 
- secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = 
headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, 
op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif 
(str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", 
"MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit 
tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def datadiscoverystudio_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def datadiscoverystudio_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "datadiscoverystudio") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def datadiscoverystudio_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "datadiscoverystudio") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def datadiscoverystudio_nabuprov(context): - returned_value = gleanerio(context,("prov"), "datadiscoverystudio") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def datadiscoverystudio_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "datadiscoverystudio") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def datadiscoverystudio_naburelease(context): - returned_value = gleanerio(context,("release"), "datadiscoverystudio") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def datadiscoverystudio_uploadrelease(context): - returned_value = post_to_graph("datadiscoverystudio", extension="nq") - r = 
str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def datadiscoverystudio_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="datadiscoverystudio") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "datadiscoverystudio" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def datadiscoverystudio_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="datadiscoverystudio") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "datadiscoverystudio" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) 
-def datadiscoverystudio_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="datadiscoverystudio") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "datadiscoverystudio" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def datadiscoverystudio_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="datadiscoverystudio") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "datadiscoverystudio" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def datadiscoverystudio_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "datadiscoverystudio" - - res = 
s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def datadiscoverystudio_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "datadiscoverystudio" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. 
{str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def datadiscoverystudio_upload_summarize(context): - returned_value = post_to_graph("datadiscoverystudio",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="datadiscoverystudio"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="datadiscoverystudio" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_datadiscoverystudio(): - containers = datadiscoverystudio_getImage() - harvest = datadiscoverystudio_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = datadiscoverystudio_missingreport_s3(start=harvest) - report_idstat = datadiscoverystudio_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = datadiscoverystudio_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="datadiscoverystudio") - load_release = datadiscoverystudio_naburelease(start=harvest) - load_uploadrelease = datadiscoverystudio_uploadrelease(start=load_release) - - load_prune = datadiscoverystudio_nabu_prune(start=load_uploadrelease) - load_prov = datadiscoverystudio_nabuprov(start=load_prune) - load_org = 
datadiscoverystudio_nabuorg(start=load_prov) - - summarize = datadiscoverystudio_summarize(start=load_uploadrelease) - upload_summarize = datadiscoverystudio_upload_summarize(start=summarize) - -# run after load - report_msgraph = datadiscoverystudio_missingreport_graph(start=summarize) - report_graph = datadiscoverystudio_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_designsafe.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_designsafe.py deleted file mode 100644 index 2859edf6..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_designsafe.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from 
dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL 
= url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in 
container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - 
return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - 
context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", 
"SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - 
try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def designsafe_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def designsafe_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "designsafe") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def designsafe_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "designsafe") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def designsafe_nabuprov(context): - returned_value = gleanerio(context,("prov"), "designsafe") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def designsafe_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "designsafe") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def designsafe_naburelease(context): - returned_value = gleanerio(context,("release"), "designsafe") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def designsafe_uploadrelease(context): - returned_value = post_to_graph("designsafe", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return 
- - -@op(ins={"start": In(Nothing)}) -def designsafe_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="designsafe") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "designsafe" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def designsafe_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="designsafe") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "designsafe" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def designsafe_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="designsafe") - #source_url = source.get('url') - 
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "designsafe" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def designsafe_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="designsafe") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "designsafe" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def designsafe_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "designsafe" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - 
get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def designsafe_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "designsafe" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def designsafe_upload_summarize(context): - returned_value = post_to_graph("designsafe",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="designsafe"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="designsafe" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_designsafe(): - containers = designsafe_getImage() - harvest = designsafe_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = designsafe_missingreport_s3(start=harvest) - report_idstat = designsafe_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = designsafe_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="designsafe") - load_release = designsafe_naburelease(start=harvest) - load_uploadrelease = designsafe_uploadrelease(start=load_release) - - load_prune = designsafe_nabu_prune(start=load_uploadrelease) - load_prov = designsafe_nabuprov(start=load_prune) - load_org = designsafe_nabuorg(start=load_prov) - - summarize = designsafe_summarize(start=load_uploadrelease) - upload_summarize = designsafe_upload_summarize(start=summarize) - -# run after load - report_msgraph = designsafe_missingreport_graph(start=summarize) - report_graph = designsafe_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_earthchem.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_earthchem.py deleted file mode 100644 index b9596fbb..00000000 --- 
a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_earthchem.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items 
-URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, 
- secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = 
headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, 
op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif 
(str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", 
"MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit 
tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def earthchem_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def earthchem_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "earthchem") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def earthchem_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "earthchem") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def earthchem_nabuprov(context): - returned_value = gleanerio(context,("prov"), "earthchem") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def earthchem_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "earthchem") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def earthchem_naburelease(context): - returned_value = gleanerio(context,("release"), "earthchem") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def earthchem_uploadrelease(context): - returned_value = post_to_graph("earthchem", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - 
-@op(ins={"start": In(Nothing)}) -def earthchem_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="earthchem") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "earthchem" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def earthchem_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="earthchem") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "earthchem" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def earthchem_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="earthchem") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "earthchem" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def earthchem_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="earthchem") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "earthchem" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def earthchem_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "earthchem" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report 
returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def earthchem_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "earthchem" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def earthchem_upload_summarize(context): - returned_value = post_to_graph("earthchem",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="earthchem"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="earthchem" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_earthchem(): - containers = earthchem_getImage() - harvest = earthchem_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = earthchem_missingreport_s3(start=harvest) - report_idstat = earthchem_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = earthchem_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="earthchem") - load_release = earthchem_naburelease(start=harvest) - load_uploadrelease = earthchem_uploadrelease(start=load_release) - - load_prune = earthchem_nabu_prune(start=load_uploadrelease) - load_prov = earthchem_nabuprov(start=load_prune) - load_org = earthchem_nabuorg(start=load_prov) - - summarize = earthchem_summarize(start=load_uploadrelease) - upload_summarize = earthchem_upload_summarize(start=summarize) - -# run after load - report_msgraph = earthchem_missingreport_graph(start=summarize) - report_graph = earthchem_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_geocodes_demo_datasets.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_geocodes_demo_datasets.py deleted file mode 100644 index 0532ae5a..00000000 --- 
a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_geocodes_demo_datasets.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") 
-# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, 
- secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = 
headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, 
op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif 
(str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", 
"MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit 
tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def geocodes_demo_datasets_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "geocodes_demo_datasets") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "geocodes_demo_datasets") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_nabuprov(context): - returned_value = gleanerio(context,("prov"), "geocodes_demo_datasets") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "geocodes_demo_datasets") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_naburelease(context): - returned_value = gleanerio(context,("release"), "geocodes_demo_datasets") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_uploadrelease(context): - returned_value = 
post_to_graph("geocodes_demo_datasets", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_demo_datasets") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "geocodes_demo_datasets" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_demo_datasets") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "geocodes_demo_datasets" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - 
get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_demo_datasets") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "geocodes_demo_datasets" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_demo_datasets") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "geocodes_demo_datasets" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, 
GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "geocodes_demo_datasets" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "geocodes_demo_datasets" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. 
{str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def geocodes_demo_datasets_upload_summarize(context): - returned_value = post_to_graph("geocodes_demo_datasets",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="geocodes_demo_datasets"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="geocodes_demo_datasets" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_geocodes_demo_datasets(): - containers = geocodes_demo_datasets_getImage() - harvest = geocodes_demo_datasets_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = geocodes_demo_datasets_missingreport_s3(start=harvest) - report_idstat = geocodes_demo_datasets_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = geocodes_demo_datasets_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="geocodes_demo_datasets") - load_release = geocodes_demo_datasets_naburelease(start=harvest) - load_uploadrelease = geocodes_demo_datasets_uploadrelease(start=load_release) - - load_prune = geocodes_demo_datasets_nabu_prune(start=load_uploadrelease) - load_prov = 
geocodes_demo_datasets_nabuprov(start=load_prune) - load_org = geocodes_demo_datasets_nabuorg(start=load_prov) - - summarize = geocodes_demo_datasets_summarize(start=load_uploadrelease) - upload_summarize = geocodes_demo_datasets_upload_summarize(start=summarize) - -# run after load - report_msgraph = geocodes_demo_datasets_missingreport_graph(start=summarize) - report_graph = geocodes_demo_datasets_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_geocodes_examples.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_geocodes_examples.py deleted file mode 100644 index 9aad7ec4..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_geocodes_examples.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import 
DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL 
= url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in 
container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - 
return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - 
context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", 
"SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - 
try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def geocodes_examples_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def geocodes_examples_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "geocodes_examples") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def geocodes_examples_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "geocodes_examples") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def geocodes_examples_nabuprov(context): - returned_value = gleanerio(context,("prov"), "geocodes_examples") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def geocodes_examples_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "geocodes_examples") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def geocodes_examples_naburelease(context): - returned_value = gleanerio(context,("release"), "geocodes_examples") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def geocodes_examples_uploadrelease(context): - returned_value = post_to_graph("geocodes_examples", extension="nq") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def geocodes_examples_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_examples") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "geocodes_examples" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def geocodes_examples_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_examples") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "geocodes_examples" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
geocodes_examples_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_examples") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "geocodes_examples" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def geocodes_examples_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="geocodes_examples") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "geocodes_examples" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def geocodes_examples_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "geocodes_examples" - - res = s3Minio.listSummonedUrls(bucket, 
source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def geocodes_examples_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "geocodes_examples" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. 
{str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def geocodes_examples_upload_summarize(context): - returned_value = post_to_graph("geocodes_examples",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="geocodes_examples"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="geocodes_examples" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_geocodes_examples(): - containers = geocodes_examples_getImage() - harvest = geocodes_examples_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = geocodes_examples_missingreport_s3(start=harvest) - report_idstat = geocodes_examples_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = geocodes_examples_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="geocodes_examples") - load_release = geocodes_examples_naburelease(start=harvest) - load_uploadrelease = geocodes_examples_uploadrelease(start=load_release) - - load_prune = geocodes_examples_nabu_prune(start=load_uploadrelease) - load_prov = geocodes_examples_nabuprov(start=load_prune) - load_org = geocodes_examples_nabuorg(start=load_prov) - 
- summarize = geocodes_examples_summarize(start=load_uploadrelease) - upload_summarize = geocodes_examples_upload_summarize(start=summarize) - -# run after load - report_msgraph = geocodes_examples_missingreport_graph(start=summarize) - report_graph = geocodes_examples_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_hydroshare.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_hydroshare.py deleted file mode 100644 index 8adcd2c9..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_hydroshare.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image 
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL 
= url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in 
container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - 
return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - 
context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", 
"SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - 
try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def hydroshare_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def hydroshare_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "hydroshare") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hydroshare_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "hydroshare") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hydroshare_nabuprov(context): - returned_value = gleanerio(context,("prov"), "hydroshare") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hydroshare_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "hydroshare") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hydroshare_naburelease(context): - returned_value = gleanerio(context,("release"), "hydroshare") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hydroshare_uploadrelease(context): - returned_value = post_to_graph("hydroshare", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return 
- - -@op(ins={"start": In(Nothing)}) -def hydroshare_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="hydroshare") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "hydroshare" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hydroshare_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="hydroshare") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "hydroshare" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hydroshare_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="hydroshare") - #source_url = source.get('url') - 
s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "hydroshare" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hydroshare_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="hydroshare") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "hydroshare" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hydroshare_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "hydroshare" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - 
get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def hydroshare_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "hydroshare" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def hydroshare_upload_summarize(context): - returned_value = post_to_graph("hydroshare",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="hydroshare"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="hydroshare" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_hydroshare(): - containers = hydroshare_getImage() - harvest = hydroshare_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = hydroshare_missingreport_s3(start=harvest) - report_idstat = hydroshare_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = hydroshare_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="hydroshare") - load_release = hydroshare_naburelease(start=harvest) - load_uploadrelease = hydroshare_uploadrelease(start=load_release) - - load_prune = hydroshare_nabu_prune(start=load_uploadrelease) - load_prov = hydroshare_nabuprov(start=load_prune) - load_org = hydroshare_nabuorg(start=load_prov) - - summarize = hydroshare_summarize(start=load_uploadrelease) - upload_summarize = hydroshare_upload_summarize(start=summarize) - -# run after load - report_msgraph = hydroshare_missingreport_graph(start=summarize) - report_graph = hydroshare_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_iedadata.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_iedadata.py deleted file mode 100644 index 150ac2de..00000000 --- 
a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_iedadata.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items 
-URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, 
- secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = 
headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, 
op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif 
(str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", 
"MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit 
tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def iedadata_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def iedadata_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "iedadata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def iedadata_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "iedadata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def iedadata_nabuprov(context): - returned_value = gleanerio(context,("prov"), "iedadata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def iedadata_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "iedadata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def iedadata_naburelease(context): - returned_value = gleanerio(context,("release"), "iedadata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def iedadata_uploadrelease(context): - returned_value = post_to_graph("iedadata", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def iedadata_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iedadata") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "iedadata" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def iedadata_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iedadata") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "iedadata" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def iedadata_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iedadata") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "iedadata" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def iedadata_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iedadata") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "iedadata" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def iedadata_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "iedadata" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report 
returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def iedadata_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "iedadata" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def iedadata_upload_summarize(context): - returned_value = post_to_graph("iedadata",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="iedadata"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="iedadata" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_iedadata(): - containers = iedadata_getImage() - harvest = iedadata_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = iedadata_missingreport_s3(start=harvest) - report_idstat = iedadata_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = iedadata_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="iedadata") - load_release = iedadata_naburelease(start=harvest) - load_uploadrelease = iedadata_uploadrelease(start=load_release) - - load_prune = iedadata_nabu_prune(start=load_uploadrelease) - load_prov = iedadata_nabuprov(start=load_prune) - load_org = iedadata_nabuorg(start=load_prov) - - summarize = iedadata_summarize(start=load_uploadrelease) - upload_summarize = iedadata_upload_summarize(start=summarize) - -# run after load - report_msgraph = iedadata_missingreport_graph(start=summarize) - report_graph = iedadata_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_iris.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_iris.py deleted file mode 100644 index e10f7586..00000000 --- 
a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_iris.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = 
os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, 
- secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = 
headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, 
op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif 
(str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", 
"MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit 
tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def iris_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def iris_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "iris") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def iris_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "iris") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def iris_nabuprov(context): - returned_value = gleanerio(context,("prov"), "iris") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def iris_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "iris") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def iris_naburelease(context): - returned_value = gleanerio(context,("release"), "iris") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def iris_uploadrelease(context): - returned_value = post_to_graph("iris", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def iris_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iris") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "iris" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def iris_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iris") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "iris" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def iris_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iris") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "iris" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def iris_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="iris") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "iris" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def iris_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "iris" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def 
iris_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "iris" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def iris_upload_summarize(context): - returned_value = post_to_graph("iris",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="iris"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="iris" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_iris(): - containers = iris_getImage() - harvest = iris_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = iris_missingreport_s3(start=harvest) - report_idstat = iris_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = iris_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="iris") - load_release = iris_naburelease(start=harvest) - load_uploadrelease = iris_uploadrelease(start=load_release) - - load_prune = iris_nabu_prune(start=load_uploadrelease) - load_prov = iris_nabuprov(start=load_prune) - load_org = iris_nabuorg(start=load_prov) - - summarize = iris_summarize(start=load_uploadrelease) - upload_summarize = iris_upload_summarize(start=summarize) - -# run after load - report_msgraph = iris_missingreport_graph(start=summarize) - report_graph = iris_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_linkedearth.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_linkedearth.py deleted file mode 100644 index df495780..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_linkedearth.py +++ /dev/null @@ -1,819 +0,0 @@ -import 
distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = 
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - 
secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to 
{url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - 
if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - 
restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - 
Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # 
minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # 
https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def linkedearth_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def linkedearth_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "linkedearth") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def linkedearth_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "linkedearth") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def linkedearth_nabuprov(context): - returned_value = gleanerio(context,("prov"), "linkedearth") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def linkedearth_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "linkedearth") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def linkedearth_naburelease(context): - returned_value = gleanerio(context,("release"), "linkedearth") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def linkedearth_uploadrelease(context): - returned_value = post_to_graph("linkedearth", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned 
{r} ") - return - - -@op(ins={"start": In(Nothing)}) -def linkedearth_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="linkedearth") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "linkedearth" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def linkedearth_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="linkedearth") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "linkedearth" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def linkedearth_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="linkedearth") - 
#source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "linkedearth" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def linkedearth_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="linkedearth") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "linkedearth" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def linkedearth_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "linkedearth" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", 
bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def linkedearth_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "linkedearth" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def linkedearth_upload_summarize(context): - returned_value = post_to_graph("linkedearth",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="linkedearth"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="linkedearth" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_linkedearth(): - containers = linkedearth_getImage() - harvest = linkedearth_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = linkedearth_missingreport_s3(start=harvest) - report_idstat = linkedearth_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = linkedearth_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="linkedearth") - load_release = linkedearth_naburelease(start=harvest) - load_uploadrelease = linkedearth_uploadrelease(start=load_release) - - load_prune = linkedearth_nabu_prune(start=load_uploadrelease) - load_prov = linkedearth_nabuprov(start=load_prune) - load_org = linkedearth_nabuorg(start=load_prov) - - summarize = linkedearth_summarize(start=load_uploadrelease) - upload_summarize = linkedearth_upload_summarize(start=summarize) - -# run after load - report_msgraph = linkedearth_missingreport_graph(start=summarize) - report_graph = linkedearth_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_lipdverse.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_lipdverse.py deleted file mode 100644 index 020a942c..00000000 --- 
a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_lipdverse.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items 
-URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, 
- secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = 
headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, 
op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif 
(str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", 
"MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit 
tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def lipdverse_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def lipdverse_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "lipdverse") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def lipdverse_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "lipdverse") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def lipdverse_nabuprov(context): - returned_value = gleanerio(context,("prov"), "lipdverse") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def lipdverse_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "lipdverse") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def lipdverse_naburelease(context): - returned_value = gleanerio(context,("release"), "lipdverse") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def lipdverse_uploadrelease(context): - returned_value = post_to_graph("lipdverse", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - 
-@op(ins={"start": In(Nothing)}) -def lipdverse_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="lipdverse") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "lipdverse" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def lipdverse_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="lipdverse") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "lipdverse" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def lipdverse_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="lipdverse") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "lipdverse" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def lipdverse_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="lipdverse") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "lipdverse" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def lipdverse_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "lipdverse" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report 
returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def lipdverse_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "lipdverse" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def lipdverse_upload_summarize(context): - returned_value = post_to_graph("lipdverse",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="lipdverse"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="lipdverse" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_lipdverse(): - containers = lipdverse_getImage() - harvest = lipdverse_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = lipdverse_missingreport_s3(start=harvest) - report_idstat = lipdverse_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = lipdverse_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="lipdverse") - load_release = lipdverse_naburelease(start=harvest) - load_uploadrelease = lipdverse_uploadrelease(start=load_release) - - load_prune = lipdverse_nabu_prune(start=load_uploadrelease) - load_prov = lipdverse_nabuprov(start=load_prune) - load_org = lipdverse_nabuorg(start=load_prov) - - summarize = lipdverse_summarize(start=load_uploadrelease) - upload_summarize = lipdverse_upload_summarize(start=summarize) - -# run after load - report_msgraph = lipdverse_missingreport_graph(start=summarize) - report_graph = lipdverse_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_magic.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_magic.py deleted file mode 100644 index 2ade2cd2..00000000 --- 
a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_magic.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL 
= os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, 
- secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = 
headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, 
op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif 
(str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", 
"MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit 
tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def magic_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def magic_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "magic") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def magic_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "magic") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def magic_nabuprov(context): - returned_value = gleanerio(context,("prov"), "magic") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def magic_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "magic") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def magic_naburelease(context): - returned_value = gleanerio(context,("release"), "magic") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def magic_uploadrelease(context): - returned_value = post_to_graph("magic", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
magic_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="magic") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "magic" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def magic_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="magic") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "magic" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def magic_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="magic") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, 
GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "magic" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def magic_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="magic") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "magic" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def magic_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "magic" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - 
-@op(ins={"start": In(Nothing)}) -def magic_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "magic" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def magic_upload_summarize(context): - returned_value = post_to_graph("magic",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="magic"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="magic" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_magic(): - containers = magic_getImage() - harvest = magic_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = magic_missingreport_s3(start=harvest) - report_idstat = magic_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = magic_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="magic") - load_release = magic_naburelease(start=harvest) - load_uploadrelease = magic_uploadrelease(start=load_release) - - load_prune = magic_nabu_prune(start=load_uploadrelease) - load_prov = magic_nabuprov(start=load_prune) - load_org = magic_nabuorg(start=load_prov) - - summarize = magic_summarize(start=load_uploadrelease) - upload_summarize = magic_upload_summarize(start=summarize) - -# run after load - report_msgraph = magic_missingreport_graph(start=summarize) - report_graph = magic_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_neon.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_neon.py deleted file mode 100644 index b8376c16..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_neon.py +++ /dev/null @@ -1,819 +0,0 @@ -import 
distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = 
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - 
secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to 
{url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - 
if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - 
restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - 
Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # 
minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # 
https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def neon_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def neon_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "neon") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def neon_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "neon") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def neon_nabuprov(context): - returned_value = gleanerio(context,("prov"), "neon") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def neon_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "neon") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def neon_naburelease(context): - returned_value = gleanerio(context,("release"), "neon") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def neon_uploadrelease(context): - returned_value = post_to_graph("neon", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def neon_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neon") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "neon" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def neon_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neon") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "neon" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def neon_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neon") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "neon" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def neon_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neon") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "neon" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def neon_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "neon" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def 
neon_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "neon" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def neon_upload_summarize(context): - returned_value = post_to_graph("neon",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="neon"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="neon" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_neon(): - containers = neon_getImage() - harvest = neon_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = neon_missingreport_s3(start=harvest) - report_idstat = neon_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = neon_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="neon") - load_release = neon_naburelease(start=harvest) - load_uploadrelease = neon_uploadrelease(start=load_release) - - load_prune = neon_nabu_prune(start=load_uploadrelease) - load_prov = neon_nabuprov(start=load_prune) - load_org = neon_nabuorg(start=load_prov) - - summarize = neon_summarize(start=load_uploadrelease) - upload_summarize = neon_upload_summarize(start=summarize) - -# run after load - report_msgraph = neon_missingreport_graph(start=summarize) - report_graph = neon_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_neotomadb.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_neotomadb.py deleted file mode 100644 index cb268e7d..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_neotomadb.py +++ /dev/null @@ -1,819 +0,0 @@ -import 
distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = 
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - 
secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to 
{url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - 
if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - 
restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - 
Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # 
minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # 
https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def neotomadb_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def neotomadb_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "neotomadb") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def neotomadb_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "neotomadb") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def neotomadb_nabuprov(context): - returned_value = gleanerio(context,("prov"), "neotomadb") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def neotomadb_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "neotomadb") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def neotomadb_naburelease(context): - returned_value = gleanerio(context,("release"), "neotomadb") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def neotomadb_uploadrelease(context): - returned_value = post_to_graph("neotomadb", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - 
-@op(ins={"start": In(Nothing)}) -def neotomadb_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neotomadb") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "neotomadb" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def neotomadb_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neotomadb") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "neotomadb" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def neotomadb_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neotomadb") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "neotomadb" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def neotomadb_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="neotomadb") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "neotomadb" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def neotomadb_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "neotomadb" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report 
returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def neotomadb_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "neotomadb" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def neotomadb_upload_summarize(context): - returned_value = post_to_graph("neotomadb",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="neotomadb"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="neotomadb" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_neotomadb(): - containers = neotomadb_getImage() - harvest = neotomadb_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = neotomadb_missingreport_s3(start=harvest) - report_idstat = neotomadb_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = neotomadb_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="neotomadb") - load_release = neotomadb_naburelease(start=harvest) - load_uploadrelease = neotomadb_uploadrelease(start=load_release) - - load_prune = neotomadb_nabu_prune(start=load_uploadrelease) - load_prov = neotomadb_nabuprov(start=load_prune) - load_org = neotomadb_nabuorg(start=load_prov) - - summarize = neotomadb_summarize(start=load_uploadrelease) - upload_summarize = neotomadb_upload_summarize(start=summarize) - -# run after load - report_msgraph = neotomadb_missingreport_graph(start=summarize) - report_graph = neotomadb_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_opencoredata.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_opencoredata.py deleted file mode 100644 index 1d84c757..00000000 --- 
a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_opencoredata.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env 
items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, 
- secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = 
headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, 
op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif 
(str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", 
"MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit 
tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def opencoredata_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def opencoredata_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "opencoredata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def opencoredata_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "opencoredata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def opencoredata_nabuprov(context): - returned_value = gleanerio(context,("prov"), "opencoredata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def opencoredata_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "opencoredata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def opencoredata_naburelease(context): - returned_value = gleanerio(context,("release"), "opencoredata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def opencoredata_uploadrelease(context): - returned_value = post_to_graph("opencoredata", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload 
release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def opencoredata_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opencoredata") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "opencoredata" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def opencoredata_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opencoredata") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "opencoredata" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def opencoredata_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, 
sourcename="opencoredata") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "opencoredata" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def opencoredata_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opencoredata") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "opencoredata" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def opencoredata_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "opencoredata" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, 
source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def opencoredata_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "opencoredata" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def opencoredata_upload_summarize(context): - returned_value = post_to_graph("opencoredata",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="opencoredata"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="opencoredata" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_opencoredata(): - containers = opencoredata_getImage() - harvest = opencoredata_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = opencoredata_missingreport_s3(start=harvest) - report_idstat = opencoredata_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = opencoredata_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="opencoredata") - load_release = opencoredata_naburelease(start=harvest) - load_uploadrelease = opencoredata_uploadrelease(start=load_release) - - load_prune = opencoredata_nabu_prune(start=load_uploadrelease) - load_prov = opencoredata_nabuprov(start=load_prune) - load_org = opencoredata_nabuorg(start=load_prov) - - summarize = opencoredata_summarize(start=load_uploadrelease) - upload_summarize = opencoredata_upload_summarize(start=summarize) - -# run after load - report_msgraph = opencoredata_missingreport_graph(start=summarize) - report_graph = opencoredata_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_opentopography.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_opentopography.py deleted file mode 100644 
index 4c82314d..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_opentopography.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', 
"headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, 
- secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = 
headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, 
op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif 
(str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", 
"MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit 
tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def opentopography_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def opentopography_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "opentopography") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def opentopography_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "opentopography") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def opentopography_nabuprov(context): - returned_value = gleanerio(context,("prov"), "opentopography") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def opentopography_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "opentopography") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def opentopography_naburelease(context): - returned_value = gleanerio(context,("release"), "opentopography") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def opentopography_uploadrelease(context): - returned_value = post_to_graph("opentopography", extension="nq") - r = str('returned value:{}'.format(returned_value)) - 
get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def opentopography_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opentopography") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "opentopography" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def opentopography_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opentopography") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "opentopography" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def opentopography_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opentopography") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "opentopography" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def opentopography_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="opentopography") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "opentopography" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def opentopography_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "opentopography" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = 
json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def opentopography_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "opentopography" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def opentopography_upload_summarize(context): - returned_value = post_to_graph("opentopography",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. 
Then import these methods? -# def missingreport_s3(context, msg: str, source="opentopography"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="opentopography" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_opentopography(): - containers = opentopography_getImage() - harvest = opentopography_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = opentopography_missingreport_s3(start=harvest) - report_idstat = opentopography_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = opentopography_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="opentopography") - load_release = opentopography_naburelease(start=harvest) - load_uploadrelease = opentopography_uploadrelease(start=load_release) - - load_prune = opentopography_nabu_prune(start=load_uploadrelease) - load_prov = opentopography_nabuprov(start=load_prune) - load_org = opentopography_nabuorg(start=load_prov) - - summarize = opentopography_summarize(start=load_uploadrelease) - upload_summarize = opentopography_upload_summarize(start=summarize) - -# run after load - report_msgraph = opentopography_missingreport_graph(start=summarize) - report_graph = opentopography_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_resource_registry.py 
b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_resource_registry.py deleted file mode 100644 index 239da8c1..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_resource_registry.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', 
"/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL 
= url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in 
container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - 
return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - 
context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", 
"SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - 
try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def resource_registry_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def resource_registry_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "resource_registry") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def resource_registry_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "resource_registry") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def resource_registry_nabuprov(context): - returned_value = gleanerio(context,("prov"), "resource_registry") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def resource_registry_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "resource_registry") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def resource_registry_naburelease(context): - returned_value = gleanerio(context,("release"), "resource_registry") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def resource_registry_uploadrelease(context): - returned_value = post_to_graph("resource_registry", extension="nq") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def resource_registry_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="resource_registry") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "resource_registry" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def resource_registry_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="resource_registry") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "resource_registry" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
resource_registry_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="resource_registry") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "resource_registry" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def resource_registry_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="resource_registry") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "resource_registry" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def resource_registry_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "resource_registry" - - res = s3Minio.listSummonedUrls(bucket, 
source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def resource_registry_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "resource_registry" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. 
{str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def resource_registry_upload_summarize(context): - returned_value = post_to_graph("resource_registry",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="resource_registry"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="resource_registry" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_resource_registry(): - containers = resource_registry_getImage() - harvest = resource_registry_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = resource_registry_missingreport_s3(start=harvest) - report_idstat = resource_registry_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = resource_registry_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="resource_registry") - load_release = resource_registry_naburelease(start=harvest) - load_uploadrelease = resource_registry_uploadrelease(start=load_release) - - load_prune = resource_registry_nabu_prune(start=load_uploadrelease) - load_prov = resource_registry_nabuprov(start=load_prune) - load_org = resource_registry_nabuorg(start=load_prov) - 
- summarize = resource_registry_summarize(start=load_uploadrelease) - upload_summarize = resource_registry_upload_summarize(start=summarize) - -# run after load - report_msgraph = resource_registry_missingreport_graph(start=summarize) - report_graph = resource_registry_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ssdbiodp.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ssdbiodp.py deleted file mode 100644 index 1e89ce42..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ssdbiodp.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from 
docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL 
= url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in 
container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - 
return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - 
context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", 
"SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - 
try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def ssdbiodp_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def ssdbiodp_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "ssdbiodp") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ssdbiodp_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "ssdbiodp") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ssdbiodp_nabuprov(context): - returned_value = gleanerio(context,("prov"), "ssdbiodp") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ssdbiodp_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "ssdbiodp") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ssdbiodp_naburelease(context): - returned_value = gleanerio(context,("release"), "ssdbiodp") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def ssdbiodp_uploadrelease(context): - returned_value = post_to_graph("ssdbiodp", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def ssdbiodp_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ssdbiodp") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "ssdbiodp" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def ssdbiodp_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ssdbiodp") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "ssdbiodp" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def ssdbiodp_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ssdbiodp") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "ssdbiodp" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ssdbiodp_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ssdbiodp") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "ssdbiodp" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ssdbiodp_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "ssdbiodp" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report 
returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def ssdbiodp_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "ssdbiodp" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def ssdbiodp_upload_summarize(context): - returned_value = post_to_graph("ssdbiodp",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="ssdbiodp"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="ssdbiodp" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_ssdbiodp(): - containers = ssdbiodp_getImage() - harvest = ssdbiodp_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = ssdbiodp_missingreport_s3(start=harvest) - report_idstat = ssdbiodp_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = ssdbiodp_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="ssdbiodp") - load_release = ssdbiodp_naburelease(start=harvest) - load_uploadrelease = ssdbiodp_uploadrelease(start=load_release) - - load_prune = ssdbiodp_nabu_prune(start=load_uploadrelease) - load_prov = ssdbiodp_nabuprov(start=load_prune) - load_org = ssdbiodp_nabuorg(start=load_prov) - - summarize = ssdbiodp_summarize(start=load_uploadrelease) - upload_summarize = ssdbiodp_upload_summarize(start=summarize) - -# run after load - report_msgraph = ssdbiodp_missingreport_graph(start=summarize) - report_graph = ssdbiodp_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ucar.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ucar.py deleted file mode 100644 index 444799dd..00000000 --- 
a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ucar.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = 
os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, 
- secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = 
headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, 
op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif 
(str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", 
"MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit 
tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def ucar_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def ucar_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "ucar") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ucar_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "ucar") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ucar_nabuprov(context): - returned_value = gleanerio(context,("prov"), "ucar") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ucar_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "ucar") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ucar_naburelease(context): - returned_value = gleanerio(context,("release"), "ucar") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def ucar_uploadrelease(context): - returned_value = post_to_graph("ucar", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def ucar_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ucar") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "ucar" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def ucar_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ucar") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "ucar" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def ucar_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ucar") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "ucar" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ucar_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ucar") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "ucar" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ucar_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "ucar" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def 
ucar_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "ucar" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def ucar_upload_summarize(context): - returned_value = post_to_graph("ucar",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="ucar"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="ucar" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_ucar(): - containers = ucar_getImage() - harvest = ucar_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = ucar_missingreport_s3(start=harvest) - report_idstat = ucar_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = ucar_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="ucar") - load_release = ucar_naburelease(start=harvest) - load_uploadrelease = ucar_uploadrelease(start=load_release) - - load_prune = ucar_nabu_prune(start=load_uploadrelease) - load_prov = ucar_nabuprov(start=load_prune) - load_org = ucar_nabuorg(start=load_prov) - - summarize = ucar_summarize(start=load_uploadrelease) - upload_summarize = ucar_upload_summarize(start=summarize) - -# run after load - report_msgraph = ucar_missingreport_graph(start=summarize) - report_graph = ucar_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_unavco.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_unavco.py deleted file mode 100644 index df1085ae..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_unavco.py +++ /dev/null @@ -1,819 +0,0 @@ -import distutils 
-import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = 
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - 
secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to 
{url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - 
if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - 
restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - 
Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # 
minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # 
https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def unavco_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def unavco_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "unavco") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def unavco_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "unavco") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def unavco_nabuprov(context): - returned_value = gleanerio(context,("prov"), "unavco") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def unavco_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "unavco") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def unavco_naburelease(context): - returned_value = gleanerio(context,("release"), "unavco") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def unavco_uploadrelease(context): - returned_value = post_to_graph("unavco", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
unavco_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unavco") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "unavco" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def unavco_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unavco") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "unavco" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def unavco_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unavco") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, 
GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "unavco" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def unavco_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unavco") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "unavco" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def unavco_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "unavco" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - 
object_name="" - -@op(ins={"start": In(Nothing)}) -def unavco_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "unavco" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def unavco_upload_summarize(context): - returned_value = post_to_graph("unavco",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="unavco"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="unavco" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_unavco(): - containers = unavco_getImage() - harvest = unavco_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = unavco_missingreport_s3(start=harvest) - report_idstat = unavco_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = unavco_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="unavco") - load_release = unavco_naburelease(start=harvest) - load_uploadrelease = unavco_uploadrelease(start=load_release) - - load_prune = unavco_nabu_prune(start=load_uploadrelease) - load_prov = unavco_nabuprov(start=load_prune) - load_org = unavco_nabuorg(start=load_prov) - - summarize = unavco_summarize(start=load_uploadrelease) - upload_summarize = unavco_upload_summarize(start=summarize) - -# run after load - report_msgraph = unavco_missingreport_graph(start=summarize) - report_graph = unavco_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_unidata.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_unidata.py deleted file mode 100644 index 5af2311e..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_unidata.py +++ /dev/null 
@@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - 
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - 
secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to 
{url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - 
if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - 
restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - 
Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # 
minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # 
https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def unidata_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def unidata_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "unidata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def unidata_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "unidata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def unidata_nabuprov(context): - returned_value = gleanerio(context,("prov"), "unidata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def unidata_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "unidata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def unidata_naburelease(context): - returned_value = gleanerio(context,("release"), "unidata") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def unidata_uploadrelease(context): - returned_value = post_to_graph("unidata", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) 
-def unidata_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unidata") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "unidata" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def unidata_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unidata") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "unidata" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def unidata_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unidata") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "unidata" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def unidata_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="unidata") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "unidata" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def unidata_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "unidata" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} 
") - return - -class S3ObjectInfo: - bucket_name="" - object_name="" - -@op(ins={"start": In(Nothing)}) -def unidata_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "unidata" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def unidata_upload_summarize(context): - returned_value = post_to_graph("unidata",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="unidata"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="unidata" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_unidata(): - containers = unidata_getImage() - harvest = unidata_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = unidata_missingreport_s3(start=harvest) - report_idstat = unidata_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = unidata_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="unidata") - load_release = unidata_naburelease(start=harvest) - load_uploadrelease = unidata_uploadrelease(start=load_release) - - load_prune = unidata_nabu_prune(start=load_uploadrelease) - load_prov = unidata_nabuprov(start=load_prune) - load_org = unidata_nabuorg(start=load_prov) - - summarize = unidata_summarize(start=load_uploadrelease) - upload_summarize = unidata_upload_summarize(start=summarize) - -# run after load - report_msgraph = unidata_missingreport_graph(start=summarize) - report_graph = unidata_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_usapdc.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_usapdc.py deleted file mode 100644 index ed2f7e9c..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_usapdc.py 
+++ /dev/null @@ -1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - 
- -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - 
secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to 
{url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - 
if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - 
restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - 
Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # 
minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # 
https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def usapdc_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def usapdc_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "usapdc") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def usapdc_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "usapdc") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def usapdc_nabuprov(context): - returned_value = gleanerio(context,("prov"), "usapdc") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def usapdc_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "usapdc") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def usapdc_naburelease(context): - returned_value = gleanerio(context,("release"), "usapdc") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def usapdc_uploadrelease(context): - returned_value = post_to_graph("usapdc", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
usapdc_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="usapdc") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "usapdc" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def usapdc_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="usapdc") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "usapdc" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def usapdc_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="usapdc") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, 
GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "usapdc" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def usapdc_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="usapdc") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "usapdc" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def usapdc_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "usapdc" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - 
object_name="" - -@op(ins={"start": In(Nothing)}) -def usapdc_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "usapdc" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def usapdc_upload_summarize(context): - returned_value = post_to_graph("usapdc",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="usapdc"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="usapdc" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_usapdc(): - containers = usapdc_getImage() - harvest = usapdc_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = usapdc_missingreport_s3(start=harvest) - report_idstat = usapdc_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = usapdc_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="usapdc") - load_release = usapdc_naburelease(start=harvest) - load_uploadrelease = usapdc_uploadrelease(start=load_release) - - load_prune = usapdc_nabu_prune(start=load_uploadrelease) - load_prov = usapdc_nabuprov(start=load_prune) - load_org = usapdc_nabuorg(start=load_prov) - - summarize = usapdc_summarize(start=load_uploadrelease) - upload_summarize = usapdc_upload_summarize(start=summarize) - -# run after load - report_msgraph = usapdc_missingreport_graph(start=summarize) - report_graph = usapdc_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_wifire.py b/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_wifire.py deleted file mode 100644 index 60f465bc..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_wifire.py +++ /dev/null @@ 
-1,819 +0,0 @@ -import distutils -import logging -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset -from ec.graph.manageGraph import ManageBlazegraph as mg -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# # -# path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) -# WHEN RUNNING dagster-dev, this needs to be a path to a local file -## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") - -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - 
-GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) - -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } - -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -#GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) - -SUMMARY_PATH = 'graphs/summary' -RELEASE_PATH = 'graphs/latest' -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url -def _graphSummaryEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANERIO_SUMMARY_GRAPH_NAMESPACE}/sparql" - return url -def _pythonMinioAddress(url, port = None): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - if port is not None: - PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS,GLEANER_MINIO_PORT ) - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - 
secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - - server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) - bucket = GLEANER_MINIO_BUCKET - release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" - # BLAZEGRAPH SPECIFIC - # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - # get_dagster_logger().info(f'graph: insert "{source}" to 
{url} ') - # r = requests.post(url) - # log.debug(f' status:{r.status_code}') # status:404 - # get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - # if r.status_code == 200: - # # '' - # if 'data modified="0"' in r.text: - # get_dagster_logger().info(f'graph: no data inserted ') - # raise Exception("No Data Added: " + r.text) - # return True - # else: - # get_dagster_logger().info(f'graph: error') - # raise Exception(f' graph: insert failed: status:{r.status_code}') - - ### GENERIC LOAD FROM - url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - loadfrom = {'update': f'LOAD <{release_url}>'} - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } - r = requests.post(url, headers=headers, data=loadfrom ) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}') - if r.status_code == 200: - get_dagster_logger().info(f'graph load response: {str(r.text)} ') - # '' - if 'mutationCount=0' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - #raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error {str(r.text)}') - raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - 
if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - 
restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - 
Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # 
minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) # the python needs to be wrapped, this does not - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # 
https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wifire_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wifire_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wifire") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wifire_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wifire") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wifire_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wifire") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wifire_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wifire") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wifire_naburelease(context): - returned_value = gleanerio(context,("release"), "wifire") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wifire_uploadrelease(context): - returned_value = post_to_graph("wifire", extension="nq") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
wifire_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="wifire") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "wifire" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wifire_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="wifire") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "wifire" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wifire_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="wifire") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, 
GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "wifire" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wifire_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="wifire") - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "wifire" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wifire_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "wifire" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - -class S3ObjectInfo: - bucket_name="" - 
object_name="" - -@op(ins={"start": In(Nothing)}) -def wifire_summarize(context) : - s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) - bucket = GLEANER_MINIO_BUCKET - source_name = "wifire" - endpoint = _graphEndpoint() # getting data, not uploading data - summary_namespace = _graphSummaryEndpoint() - - - try: - - summarydf = get_summary4repoSubset(endpoint, source_name) - nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator - summaryttl = g.serialize(format='longturtle') - # Lets always write out file to s3, and insert as a separate process - # we might be able to make this an asset..., but would need to be acessible by http - # if not stored in s3 - objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post - s3ObjectInfo= S3ObjectInfo() - s3ObjectInfo.bucket_name=bucket - s3ObjectInfo.object_name=objectname - - s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo ) - #inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") - #if not inserted: - # raise Exception("Loading to graph failed.") - except Exception as e: - # use dagster logger - get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") - raise Exception(f"Loading Summary graph failed. {str(e)}") - return 1 - - return - -@op(ins={"start": In(Nothing)}) -def wifire_upload_summarize(context): - returned_value = post_to_graph("wifire",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload summary returned {r} ") - return - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wifire"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wifire" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wifire(): - containers = wifire_getImage() - harvest = wifire_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wifire_missingreport_s3(start=harvest) - report_idstat = wifire_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wifire_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wifire") - load_release = wifire_naburelease(start=harvest) - load_uploadrelease = wifire_uploadrelease(start=load_release) - - load_prune = wifire_nabu_prune(start=load_uploadrelease) - load_prov = wifire_nabuprov(start=load_prune) - load_org = wifire_nabuorg(start=load_prov) - - summarize = wifire_summarize(start=load_uploadrelease) - upload_summarize = wifire_upload_summarize(start=summarize) - -# run after load - report_msgraph = wifire_missingreport_graph(start=summarize) - report_graph = wifire_graph_reports(start=report_msgraph) - - - - - - - diff --git a/dagster/implnets/generatedCode/implnet-eco/output/repositories/repository.py b/dagster/implnets/generatedCode/implnet-eco/output/repositories/repository.py deleted file mode 100644 index e77f2141..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/repositories/repository.py +++ /dev/null 
@@ -1,67 +0,0 @@ -from dagster import repository -from jobs.implnet_jobs_amgeo import implnet_job_amgeo -from sch.implnet_sch_amgeo import implnet_sch_amgeo -from jobs.implnet_jobs_aquadocs import implnet_job_aquadocs -from sch.implnet_sch_aquadocs import implnet_sch_aquadocs -from jobs.implnet_jobs_bcodmo import implnet_job_bcodmo -from sch.implnet_sch_bcodmo import implnet_sch_bcodmo -from jobs.implnet_jobs_cchdo import implnet_job_cchdo -from sch.implnet_sch_cchdo import implnet_sch_cchdo -from jobs.implnet_jobs_datadiscoverystudio import implnet_job_datadiscoverystudio -from sch.implnet_sch_datadiscoverystudio import implnet_sch_datadiscoverystudio -from jobs.implnet_jobs_designsafe import implnet_job_designsafe -from sch.implnet_sch_designsafe import implnet_sch_designsafe -from jobs.implnet_jobs_earthchem import implnet_job_earthchem -from sch.implnet_sch_earthchem import implnet_sch_earthchem -from jobs.implnet_jobs_ecrr_examples import implnet_job_ecrr_examples -from sch.implnet_sch_ecrr_examples import implnet_sch_ecrr_examples -from jobs.implnet_jobs_edi import implnet_job_edi -from sch.implnet_sch_edi import implnet_sch_edi -from jobs.implnet_jobs_geocodes_demo_datasets import implnet_job_geocodes_demo_datasets -from sch.implnet_sch_geocodes_demo_datasets import implnet_sch_geocodes_demo_datasets -from jobs.implnet_jobs_geocodes_examples import implnet_job_geocodes_examples -from sch.implnet_sch_geocodes_examples import implnet_sch_geocodes_examples -from jobs.implnet_jobs_hydroshare import implnet_job_hydroshare -from sch.implnet_sch_hydroshare import implnet_sch_hydroshare -from jobs.implnet_jobs_iedadata import implnet_job_iedadata -from sch.implnet_sch_iedadata import implnet_sch_iedadata -from jobs.implnet_jobs_iris import implnet_job_iris -from sch.implnet_sch_iris import implnet_sch_iris -from jobs.implnet_jobs_linkedearth import implnet_job_linkedearth -from sch.implnet_sch_linkedearth import implnet_sch_linkedearth -from 
jobs.implnet_jobs_lipdverse import implnet_job_lipdverse -from sch.implnet_sch_lipdverse import implnet_sch_lipdverse -from jobs.implnet_jobs_magic import implnet_job_magic -from sch.implnet_sch_magic import implnet_sch_magic -from jobs.implnet_jobs_neon import implnet_job_neon -from sch.implnet_sch_neon import implnet_sch_neon -from jobs.implnet_jobs_neotomadb import implnet_job_neotomadb -from sch.implnet_sch_neotomadb import implnet_sch_neotomadb -from jobs.implnet_jobs_opencoredata import implnet_job_opencoredata -from sch.implnet_sch_opencoredata import implnet_sch_opencoredata -from jobs.implnet_jobs_opentopography import implnet_job_opentopography -from sch.implnet_sch_opentopography import implnet_sch_opentopography -from jobs.implnet_jobs_r2r import implnet_job_r2r -from sch.implnet_sch_r2r import implnet_sch_r2r -from jobs.implnet_jobs_resource_registry import implnet_job_resource_registry -from sch.implnet_sch_resource_registry import implnet_sch_resource_registry -from jobs.implnet_jobs_ssdbiodp import implnet_job_ssdbiodp -from sch.implnet_sch_ssdbiodp import implnet_sch_ssdbiodp -from jobs.implnet_jobs_ucar import implnet_job_ucar -from sch.implnet_sch_ucar import implnet_sch_ucar -from jobs.implnet_jobs_unavco import implnet_job_unavco -from sch.implnet_sch_unavco import implnet_sch_unavco -from jobs.implnet_jobs_unidata import implnet_job_unidata -from sch.implnet_sch_unidata import implnet_sch_unidata -from jobs.implnet_jobs_usapdc import implnet_job_usapdc -from sch.implnet_sch_usapdc import implnet_sch_usapdc -from jobs.implnet_jobs_wifire import implnet_job_wifire -from sch.implnet_sch_wifire import implnet_sch_wifire - -@repository -def gleaner(): - jobs = [implnet_job_amgeo, implnet_job_aquadocs, implnet_job_bcodmo, implnet_job_cchdo, implnet_job_datadiscoverystudio, implnet_job_designsafe, implnet_job_earthchem, implnet_job_ecrr_examples, implnet_job_edi, implnet_job_geocodes_demo_datasets, implnet_job_geocodes_examples, 
implnet_job_hydroshare, implnet_job_iedadata, implnet_job_iris, implnet_job_linkedearth, implnet_job_lipdverse, implnet_job_magic, implnet_job_neon, implnet_job_neotomadb, implnet_job_opencoredata, implnet_job_opentopography, implnet_job_r2r, implnet_job_resource_registry, implnet_job_ssdbiodp, implnet_job_ucar, implnet_job_unavco, implnet_job_unidata, implnet_job_usapdc, implnet_job_wifire] - schedules = [implnet_sch_amgeo, implnet_sch_aquadocs, implnet_sch_bcodmo, implnet_sch_cchdo, implnet_sch_datadiscoverystudio, implnet_sch_designsafe, implnet_sch_earthchem, implnet_sch_ecrr_examples, implnet_sch_edi, implnet_sch_geocodes_demo_datasets, implnet_sch_geocodes_examples, implnet_sch_hydroshare, implnet_sch_iedadata, implnet_sch_iris, implnet_sch_linkedearth, implnet_sch_lipdverse, implnet_sch_magic, implnet_sch_neon, implnet_sch_neotomadb, implnet_sch_opencoredata, implnet_sch_opentopography, implnet_sch_r2r, implnet_sch_resource_registry, implnet_sch_ssdbiodp, implnet_sch_ucar, implnet_sch_unavco, implnet_sch_unidata, implnet_sch_usapdc, implnet_sch_wifire] - - - return jobs + schedules diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_amgeo.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_amgeo.py deleted file mode 100644 index 8e861599..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_amgeo.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_amgeo import implnet_job_amgeo - -@schedule(cron_schedule="0 0 1 * *", job=implnet_job_amgeo, execution_timezone="US/Central") -def implnet_sch_amgeo(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_aquadocs.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_aquadocs.py deleted file mode 100644 index 00c1bf64..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_aquadocs.py +++ 
/dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_aquadocs import implnet_job_aquadocs - -@schedule(cron_schedule="0 6 1 * *", job=implnet_job_aquadocs, execution_timezone="US/Central") -def implnet_sch_aquadocs(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_bcodmo.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_bcodmo.py deleted file mode 100644 index f45ed457..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_bcodmo.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_bcodmo import implnet_job_bcodmo - -@schedule(cron_schedule="0 12 1 * *", job=implnet_job_bcodmo, execution_timezone="US/Central") -def implnet_sch_bcodmo(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_cchdo.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_cchdo.py deleted file mode 100644 index 09ae1e29..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_cchdo.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cchdo import implnet_job_cchdo - -@schedule(cron_schedule="0 18 1 * *", job=implnet_job_cchdo, execution_timezone="US/Central") -def implnet_sch_cchdo(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_datadiscoverystudio.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_datadiscoverystudio.py deleted file mode 100644 index 2d15eb65..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_datadiscoverystudio.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_datadiscoverystudio import implnet_job_datadiscoverystudio - -@schedule(cron_schedule="0 0 2 * *", 
job=implnet_job_datadiscoverystudio, execution_timezone="US/Central") -def implnet_sch_datadiscoverystudio(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_designsafe.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_designsafe.py deleted file mode 100644 index 119c6b48..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_designsafe.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_designsafe import implnet_job_designsafe - -@schedule(cron_schedule="0 6 2 * *", job=implnet_job_designsafe, execution_timezone="US/Central") -def implnet_sch_designsafe(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_earthchem.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_earthchem.py deleted file mode 100644 index c7860c4d..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_earthchem.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_earthchem import implnet_job_earthchem - -@schedule(cron_schedule="0 12 2 * *", job=implnet_job_earthchem, execution_timezone="US/Central") -def implnet_sch_earthchem(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ecrr_examples.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ecrr_examples.py deleted file mode 100644 index 0a6d6ffa..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ecrr_examples.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ecrr_examples import implnet_job_ecrr_examples - -@schedule(cron_schedule="0 18 2 * *", job=implnet_job_ecrr_examples, execution_timezone="US/Central") -def implnet_sch_ecrr_examples(_context): - run_config = {} 
- return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_edi.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_edi.py deleted file mode 100644 index a864502d..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_edi.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_edi import implnet_job_edi - -@schedule(cron_schedule="0 0 3 * *", job=implnet_job_edi, execution_timezone="US/Central") -def implnet_sch_edi(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_geocodes_demo_datasets.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_geocodes_demo_datasets.py deleted file mode 100644 index 092d312f..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_geocodes_demo_datasets.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_geocodes_demo_datasets import implnet_job_geocodes_demo_datasets - -@schedule(cron_schedule="0 6 3 * *", job=implnet_job_geocodes_demo_datasets, execution_timezone="US/Central") -def implnet_sch_geocodes_demo_datasets(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_geocodes_examples.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_geocodes_examples.py deleted file mode 100644 index 2e409a45..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_geocodes_examples.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_geocodes_examples import implnet_job_geocodes_examples - -@schedule(cron_schedule="0 12 3 * *", job=implnet_job_geocodes_examples, execution_timezone="US/Central") -def implnet_sch_geocodes_examples(_context): - run_config = {} - return run_config diff --git 
a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_hydroshare.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_hydroshare.py deleted file mode 100644 index 8fd85c67..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_hydroshare.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_hydroshare import implnet_job_hydroshare - -@schedule(cron_schedule="0 18 3 * *", job=implnet_job_hydroshare, execution_timezone="US/Central") -def implnet_sch_hydroshare(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_iedadata.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_iedadata.py deleted file mode 100644 index 6010ae2c..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_iedadata.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_iedadata import implnet_job_iedadata - -@schedule(cron_schedule="0 0 4 * *", job=implnet_job_iedadata, execution_timezone="US/Central") -def implnet_sch_iedadata(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_iris.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_iris.py deleted file mode 100644 index d65602b7..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_iris.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_iris import implnet_job_iris - -@schedule(cron_schedule="0 6 4 * *", job=implnet_job_iris, execution_timezone="US/Central") -def implnet_sch_iris(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_linkedearth.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_linkedearth.py deleted file mode 100644 index 
5ba7776b..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_linkedearth.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_linkedearth import implnet_job_linkedearth - -@schedule(cron_schedule="0 12 4 * *", job=implnet_job_linkedearth, execution_timezone="US/Central") -def implnet_sch_linkedearth(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_lipdverse.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_lipdverse.py deleted file mode 100644 index 3483de64..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_lipdverse.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_lipdverse import implnet_job_lipdverse - -@schedule(cron_schedule="0 18 4 * *", job=implnet_job_lipdverse, execution_timezone="US/Central") -def implnet_sch_lipdverse(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_magic.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_magic.py deleted file mode 100644 index 62859a3c..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_magic.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_magic import implnet_job_magic - -@schedule(cron_schedule="0 0 5 * *", job=implnet_job_magic, execution_timezone="US/Central") -def implnet_sch_magic(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_neon.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_neon.py deleted file mode 100644 index 5ac234b6..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_neon.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_neon import 
implnet_job_neon - -@schedule(cron_schedule="0 6 5 * *", job=implnet_job_neon, execution_timezone="US/Central") -def implnet_sch_neon(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_neotomadb.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_neotomadb.py deleted file mode 100644 index 1e928579..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_neotomadb.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_neotomadb import implnet_job_neotomadb - -@schedule(cron_schedule="0 12 5 * *", job=implnet_job_neotomadb, execution_timezone="US/Central") -def implnet_sch_neotomadb(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_opencoredata.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_opencoredata.py deleted file mode 100644 index 626f7e9c..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_opencoredata.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_opencoredata import implnet_job_opencoredata - -@schedule(cron_schedule="0 18 5 * *", job=implnet_job_opencoredata, execution_timezone="US/Central") -def implnet_sch_opencoredata(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_opentopography.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_opentopography.py deleted file mode 100644 index 3bc4a32b..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_opentopography.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_opentopography import implnet_job_opentopography - -@schedule(cron_schedule="0 0 6 * *", job=implnet_job_opentopography, execution_timezone="US/Central") -def 
implnet_sch_opentopography(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_r2r.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_r2r.py deleted file mode 100644 index 132e6a59..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_r2r.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_r2r import implnet_job_r2r - -@schedule(cron_schedule="0 6 6 * *", job=implnet_job_r2r, execution_timezone="US/Central") -def implnet_sch_r2r(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_resource_registry.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_resource_registry.py deleted file mode 100644 index 09713367..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_resource_registry.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_resource_registry import implnet_job_resource_registry - -@schedule(cron_schedule="0 12 6 * *", job=implnet_job_resource_registry, execution_timezone="US/Central") -def implnet_sch_resource_registry(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ssdbiodp.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ssdbiodp.py deleted file mode 100644 index c850fa6e..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ssdbiodp.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ssdbiodp import implnet_job_ssdbiodp - -@schedule(cron_schedule="0 18 6 * *", job=implnet_job_ssdbiodp, execution_timezone="US/Central") -def implnet_sch_ssdbiodp(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ucar.py 
b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ucar.py deleted file mode 100644 index 55161270..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_ucar.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ucar import implnet_job_ucar - -@schedule(cron_schedule="0 0 7 * *", job=implnet_job_ucar, execution_timezone="US/Central") -def implnet_sch_ucar(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_unavco.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_unavco.py deleted file mode 100644 index fb6cbbfc..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_unavco.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_unavco import implnet_job_unavco - -@schedule(cron_schedule="0 6 7 * *", job=implnet_job_unavco, execution_timezone="US/Central") -def implnet_sch_unavco(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_unidata.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_unidata.py deleted file mode 100644 index 8915a422..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_unidata.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_unidata import implnet_job_unidata - -@schedule(cron_schedule="0 12 7 * *", job=implnet_job_unidata, execution_timezone="US/Central") -def implnet_sch_unidata(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_usapdc.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_usapdc.py deleted file mode 100644 index 170dfba5..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_usapdc.py +++ /dev/null @@ -1,8 +0,0 @@ 
-from dagster import schedule - -from jobs.implnet_jobs_usapdc import implnet_job_usapdc - -@schedule(cron_schedule="0 18 7 * *", job=implnet_job_usapdc, execution_timezone="US/Central") -def implnet_sch_usapdc(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_wifire.py b/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_wifire.py deleted file mode 100644 index 746d27a1..00000000 --- a/dagster/implnets/generatedCode/implnet-eco/output/sch/implnet_sch_wifire.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wifire import implnet_job_wifire - -@schedule(cron_schedule="0 0 1 * *", job=implnet_job_wifire, execution_timezone="US/Central") -def implnet_sch_wifire(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_aiannh0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_aiannh0.py deleted file mode 100644 index 985bc623..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_aiannh0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_aiannh0 import harvest_aiannh0 - -@job -def implnet_job_aiannh0(): - harvest_aiannh0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_autotest10.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_autotest10.py deleted file mode 100644 index 7febeb1e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_autotest10.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_autotest10 import harvest_autotest10 - -@job -def implnet_job_autotest10(): - harvest_autotest10() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_autotest20.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_autotest20.py deleted file mode 100644 index 8d68d11f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_autotest20.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_autotest20 import harvest_autotest20 - -@job -def implnet_job_autotest20(): - harvest_autotest20() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cagagespids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cagagespids0.py deleted file mode 100644 index 79278858..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cagagespids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cagagespids0 import harvest_cagagespids0 - -@job -def implnet_job_cagagespids0(): - harvest_cagagespids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cbsa0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cbsa0.py deleted file mode 100644 index 0023ea51..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cbsa0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cbsa0 import harvest_cbsa0 - -@job -def implnet_job_cbsa0(): - harvest_cbsa0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_chyldpilotids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_chyldpilotids0.py deleted file mode 100644 index e0b87163..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_chyldpilotids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_chyldpilotids0 import harvest_chyldpilotids0 - -@job -def implnet_job_chyldpilotids0(): - harvest_chyldpilotids0() \ No newline at end of 
file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_counties0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_counties0.py deleted file mode 100644 index b6d2feab..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_counties0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_counties0 import harvest_counties0 - -@job -def implnet_job_counties0(): - harvest_counties0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisandrewsforestlterids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisandrewsforestlterids0.py deleted file mode 100644 index e8397930..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisandrewsforestlterids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisandrewsforestlterids0 import harvest_cuahsihisandrewsforestlterids0 - -@job -def implnet_job_cuahsihisandrewsforestlterids0(): - harvest_cuahsihisandrewsforestlterids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisbrazilucbids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisbrazilucbids0.py deleted file mode 100644 index 6fddea7b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisbrazilucbids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisbrazilucbids0 import harvest_cuahsihisbrazilucbids0 - -@job -def implnet_job_cuahsihisbrazilucbids0(): - harvest_cuahsihisbrazilucbids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscalvinhhsids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscalvinhhsids0.py 
deleted file mode 100644 index d44b94ae..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscalvinhhsids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihiscalvinhhsids0 import harvest_cuahsihiscalvinhhsids0 - -@job -def implnet_job_cuahsihiscalvinhhsids0(): - harvest_cuahsihiscalvinhhsids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisccbepdapids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisccbepdapids0.py deleted file mode 100644 index f6ff9eeb..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisccbepdapids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisccbepdapids0 import harvest_cuahsihisccbepdapids0 - -@job -def implnet_job_cuahsihisccbepdapids0(): - harvest_cuahsihisccbepdapids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscedarriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscedarriverids0.py deleted file mode 100644 index ece85dc1..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscedarriverids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihiscedarriverids0 import harvest_cuahsihiscedarriverids0 - -@job -def implnet_job_cuahsihiscedarriverids0(): - harvest_cuahsihiscedarriverids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisclarksburgspids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisclarksburgspids0.py deleted file mode 100644 index ce0d45ec..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisclarksburgspids0.py +++ /dev/null @@ 
-1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisclarksburgspids0 import harvest_cuahsihisclarksburgspids0 - -@job -def implnet_job_cuahsihisclarksburgspids0(): - harvest_cuahsihisclarksburgspids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscocorahsids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscocorahsids0.py deleted file mode 100644 index 618cde85..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscocorahsids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihiscocorahsids0 import harvest_cuahsihiscocorahsids0 - -@job -def implnet_job_cuahsihiscocorahsids0(): - harvest_cuahsihiscocorahsids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscrwaids0.py deleted file mode 100644 index 69e6b2bd..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscrwaids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihiscrwaids0 import harvest_cuahsihiscrwaids0 - -@job -def implnet_job_cuahsihiscrwaids0(): - harvest_cuahsihiscrwaids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscuisoids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscuisoids0.py deleted file mode 100644 index 13ef179b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiscuisoids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihiscuisoids0 import harvest_cuahsihiscuisoids0 - -@job -def implnet_job_cuahsihiscuisoids0(): - harvest_cuahsihiscuisoids0() \ No newline at end of file 
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoarizids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoarizids0.py deleted file mode 100644 index e7fcc57c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoarizids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisczoarizids0 import harvest_cuahsihisczoarizids0 - -@job -def implnet_job_cuahsihisczoarizids0(): - harvest_cuahsihisczoarizids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoboulderids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoboulderids0.py deleted file mode 100644 index dba5a017..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoboulderids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisczoboulderids0 import harvest_cuahsihisczoboulderids0 - -@job -def implnet_job_cuahsihisczoboulderids0(): - harvest_cuahsihisczoboulderids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczocatalinaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczocatalinaids0.py deleted file mode 100644 index 5912966c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczocatalinaids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisczocatalinaids0 import harvest_cuahsihisczocatalinaids0 - -@job -def implnet_job_cuahsihisczocatalinaids0(): - harvest_cuahsihisczocatalinaids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoluquilloids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoluquilloids0.py deleted file mode 100644 index 9e87cac2..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoluquilloids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisczoluquilloids0 import harvest_cuahsihisczoluquilloids0 - -@job -def implnet_job_cuahsihisczoluquilloids0(): - harvest_cuahsihisczoluquilloids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczomercedids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczomercedids0.py deleted file mode 100644 index 4c23dda9..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczomercedids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisczomercedids0 import harvest_cuahsihisczomercedids0 - -@job -def implnet_job_cuahsihisczomercedids0(): - harvest_cuahsihisczomercedids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczopsuids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczopsuids0.py deleted file mode 100644 index 56914fae..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczopsuids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisczopsuids0 import harvest_cuahsihisczopsuids0 - -@job -def implnet_job_cuahsihisczopsuids0(): - harvest_cuahsihisczopsuids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoudelids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoudelids0.py deleted file mode 100644 index 96309a4c..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisczoudelids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisczoudelids0 import harvest_cuahsihisczoudelids0 - -@job -def implnet_job_cuahsihisczoudelids0(): - harvest_cuahsihisczoudelids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisdrwiids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisdrwiids0.py deleted file mode 100644 index 6863e32d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisdrwiids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisdrwiids0 import harvest_cuahsihisdrwiids0 - -@job -def implnet_job_cuahsihisdrwiids0(): - harvest_cuahsihisdrwiids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfarmrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfarmrwaids0.py deleted file mode 100644 index 97774eed..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfarmrwaids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisfarmrwaids0 import harvest_cuahsihisfarmrwaids0 - -@job -def implnet_job_cuahsihisfarmrwaids0(): - harvest_cuahsihisfarmrwaids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfcelterids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfcelterids0.py deleted file mode 100644 index 76f0027e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfcelterids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisfcelterids0 import harvest_cuahsihisfcelterids0 - -@job 
-def implnet_job_cuahsihisfcelterids0(): - harvest_cuahsihisfcelterids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfrcwqmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfrcwqmids0.py deleted file mode 100644 index 80fc98f6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisfrcwqmids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisfrcwqmids0 import harvest_cuahsihisfrcwqmids0 - -@job -def implnet_job_cuahsihisfrcwqmids0(): - harvest_cuahsihisfrcwqmids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisghcnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisghcnids0.py deleted file mode 100644 index 89dcb842..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisghcnids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisghcnids0 import harvest_cuahsihisghcnids0 - -@job -def implnet_job_cuahsihisghcnids0(): - harvest_cuahsihisghcnids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisglacialridgeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisglacialridgeids0.py deleted file mode 100644 index 78d05082..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisglacialridgeids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisglacialridgeids0 import harvest_cuahsihisglacialridgeids0 - -@job -def implnet_job_cuahsihisglacialridgeids0(): - harvest_cuahsihisglacialridgeids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonauburnids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonauburnids0.py deleted file mode 100644 index 9283dc5f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonauburnids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisgleonauburnids0 import harvest_cuahsihisgleonauburnids0 - -@job -def implnet_job_cuahsihisgleonauburnids0(): - harvest_cuahsihisgleonauburnids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleondorsetids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleondorsetids0.py deleted file mode 100644 index 94f4e2ff..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleondorsetids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisgleondorsetids0 import harvest_cuahsihisgleondorsetids0 - -@job -def implnet_job_cuahsihisgleondorsetids0(): - harvest_cuahsihisgleondorsetids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonlakeannieids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonlakeannieids0.py deleted file mode 100644 index 7ab573cd..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonlakeannieids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisgleonlakeannieids0 import harvest_cuahsihisgleonlakeannieids0 - -@job -def implnet_job_cuahsihisgleonlakeannieids0(): - harvest_cuahsihisgleonlakeannieids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonsunapeeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonsunapeeids0.py deleted file 
mode 100644 index 325a6398..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgleonsunapeeids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisgleonsunapeeids0 import harvest_cuahsihisgleonsunapeeids0 - -@job -def implnet_job_cuahsihisgleonsunapeeids0(): - harvest_cuahsihisgleonsunapeeids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisglobalriversobservatoryids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisglobalriversobservatoryids0.py deleted file mode 100644 index 25f50f3d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisglobalriversobservatoryids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisglobalriversobservatoryids0 import harvest_cuahsihisglobalriversobservatoryids0 - -@job -def implnet_job_cuahsihisglobalriversobservatoryids0(): - harvest_cuahsihisglobalriversobservatoryids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgonggaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgonggaids0.py deleted file mode 100644 index 6665bd2c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisgonggaids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisgonggaids0 import harvest_cuahsihisgonggaids0 - -@job -def implnet_job_cuahsihisgonggaids0(): - harvest_cuahsihisgonggaids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishassbergeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishassbergeids0.py deleted file mode 100644 index d04b02c3..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishassbergeids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihishassbergeids0 import harvest_cuahsihishassbergeids0 - -@job -def implnet_job_cuahsihishassbergeids0(): - harvest_cuahsihishassbergeids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishydrodataczdids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishydrodataczdids0.py deleted file mode 100644 index ee8b0f69..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishydrodataczdids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihishydrodataczdids0 import harvest_cuahsihishydrodataczdids0 - -@job -def implnet_job_cuahsihishydrodataczdids0(): - harvest_cuahsihishydrodataczdids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishydrodataczhrids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishydrodataczhrids0.py deleted file mode 100644 index ed6554c7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihishydrodataczhrids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihishydrodataczhrids0 import harvest_cuahsihishydrodataczhrids0 - -@job -def implnet_job_cuahsihishydrodataczhrids0(): - harvest_cuahsihishydrodataczhrids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisieeratwilkesuniversityids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisieeratwilkesuniversityids0.py deleted file mode 100644 index 57a2a259..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisieeratwilkesuniversityids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisieeratwilkesuniversityids0 import harvest_cuahsihisieeratwilkesuniversityids0 - -@job -def implnet_job_cuahsihisieeratwilkesuniversityids0(): - harvest_cuahsihisieeratwilkesuniversityids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisirwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisirwaids0.py deleted file mode 100644 index b07f9970..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisirwaids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisirwaids0 import harvest_cuahsihisirwaids0 - -@job -def implnet_job_cuahsihisirwaids0(): - harvest_cuahsihisirwaids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisisbenaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisisbenaids0.py deleted file mode 100644 index b282caad..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisisbenaids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisisbenaids0 import harvest_cuahsihisisbenaids0 - -@job -def implnet_job_cuahsihisisbenaids0(): - harvest_cuahsihisisbenaids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiskansasweatherdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiskansasweatherdataids0.py deleted file mode 100644 index 23913071..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihiskansasweatherdataids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster 
import job - -from ops.implnet_ops_cuahsihiskansasweatherdataids0 import harvest_cuahsihiskansasweatherdataids0 - -@job -def implnet_job_cuahsihiskansasweatherdataids0(): - harvest_cuahsihiskansasweatherdataids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislaselvastreamdischargeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislaselvastreamdischargeids0.py deleted file mode 100644 index afb1f0b4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislaselvastreamdischargeids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihislaselvastreamdischargeids0 import harvest_cuahsihislaselvastreamdischargeids0 - -@job -def implnet_job_cuahsihislaselvastreamdischargeids0(): - harvest_cuahsihislaselvastreamdischargeids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislczoodm2ids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislczoodm2ids0.py deleted file mode 100644 index 94df9db0..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislczoodm2ids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihislczoodm2ids0 import harvest_cuahsihislczoodm2ids0 - -@job -def implnet_job_cuahsihislczoodm2ids0(): - harvest_cuahsihislczoodm2ids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislittlebearriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislittlebearriverids0.py deleted file mode 100644 index 5b680e39..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislittlebearriverids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from 
ops.implnet_ops_cuahsihislittlebearriverids0 import harvest_cuahsihislittlebearriverids0 - -@job -def implnet_job_cuahsihislittlebearriverids0(): - harvest_cuahsihislittlebearriverids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisloganrivergamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisloganrivergamutids0.py deleted file mode 100644 index ab0eccb7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisloganrivergamutids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisloganrivergamutids0 import harvest_cuahsihisloganrivergamutids0 - -@job -def implnet_job_cuahsihisloganrivergamutids0(): - harvest_cuahsihisloganrivergamutids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisloganriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisloganriverids0.py deleted file mode 100644 index a4be3930..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisloganriverids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisloganriverids0 import harvest_cuahsihisloganriverids0 - -@job -def implnet_job_cuahsihisloganriverids0(): - harvest_cuahsihisloganriverids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislterntlwoodruffids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislterntlwoodruffids0.py deleted file mode 100644 index 33c6b441..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihislterntlwoodruffids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihislterntlwoodruffids0 import 
harvest_cuahsihislterntlwoodruffids0 - -@job -def implnet_job_cuahsihislterntlwoodruffids0(): - harvest_cuahsihislterntlwoodruffids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisluwlids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisluwlids0.py deleted file mode 100644 index a799e703..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisluwlids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisluwlids0 import harvest_cuahsihisluwlids0 - -@job -def implnet_job_cuahsihisluwlids0(): - harvest_cuahsihisluwlids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismaaeriids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismaaeriids0.py deleted file mode 100644 index 0a8b5de7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismaaeriids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihismaaeriids0 import harvest_cuahsihismaaeriids0 - -@job -def implnet_job_cuahsihismaaeriids0(): - harvest_cuahsihismaaeriids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismazarriverprojectids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismazarriverprojectids0.py deleted file mode 100644 index ef82758b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismazarriverprojectids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihismazarriverprojectids0 import harvest_cuahsihismazarriverprojectids0 - -@job -def implnet_job_cuahsihismazarriverprojectids0(): - harvest_cuahsihismazarriverprojectids0() \ No newline at end of file diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismmaatacamaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismmaatacamaids0.py deleted file mode 100644 index 8730f346..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismmaatacamaids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihismmaatacamaids0 import harvest_cuahsihismmaatacamaids0 - -@job -def implnet_job_cuahsihismmaatacamaids0(): - harvest_cuahsihismmaatacamaids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismobilecrowdhydrologyids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismobilecrowdhydrologyids0.py deleted file mode 100644 index 72244932..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismobilecrowdhydrologyids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihismobilecrowdhydrologyids0 import harvest_cuahsihismobilecrowdhydrologyids0 - -@job -def implnet_job_cuahsihismobilecrowdhydrologyids0(): - harvest_cuahsihismobilecrowdhydrologyids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismopexids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismopexids0.py deleted file mode 100644 index 66dc9e0f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismopexids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihismopexids0 import harvest_cuahsihismopexids0 - -@job -def implnet_job_cuahsihismopexids0(): - harvest_cuahsihismopexids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismuddyriverids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismuddyriverids0.py deleted file mode 100644 index 9c35a512..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismuddyriverids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihismuddyriverids0 import harvest_cuahsihismuddyriverids0 - -@job -def implnet_job_cuahsihismuddyriverids0(): - harvest_cuahsihismuddyriverids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismudlakeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismudlakeids0.py deleted file mode 100644 index 1699cbef..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismudlakeids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihismudlakeids0 import harvest_cuahsihismudlakeids0 - -@job -def implnet_job_cuahsihismudlakeids0(): - harvest_cuahsihismudlakeids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismwdisids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismwdisids0.py deleted file mode 100644 index 37e30248..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismwdisids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihismwdisids0 import harvest_cuahsihismwdisids0 - -@job -def implnet_job_cuahsihismwdisids0(): - harvest_cuahsihismwdisids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismwraids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismwraids0.py deleted file mode 100644 index a8d7ee25..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihismwraids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihismwraids0 import harvest_cuahsihismwraids0 - -@job -def implnet_job_cuahsihismwraids0(): - harvest_cuahsihismwraids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnashrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnashrwaids0.py deleted file mode 100644 index e74a54c3..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnashrwaids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisnashrwaids0 import harvest_cuahsihisnashrwaids0 - -@job -def implnet_job_cuahsihisnashrwaids0(): - harvest_cuahsihisnashrwaids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnceiww2ids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnceiww2ids0.py deleted file mode 100644 index 346fe0f9..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnceiww2ids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisnceiww2ids0 import harvest_cuahsihisnceiww2ids0 - -@job -def implnet_job_cuahsihisnceiww2ids0(): - harvest_cuahsihisnceiww2ids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisneonids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisneonids0.py deleted file mode 100644 index 150231ef..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisneonids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisneonids0 import harvest_cuahsihisneonids0 - -@job -def 
implnet_job_cuahsihisneonids0(): - harvest_cuahsihisneonids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnevadosids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnevadosids0.py deleted file mode 100644 index 589d4d1d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnevadosids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisnevadosids0 import harvest_cuahsihisnevadosids0 - -@job -def implnet_job_cuahsihisnevadosids0(): - harvest_cuahsihisnevadosids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnevcanids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnevcanids0.py deleted file mode 100644 index 44b3a006..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnevcanids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisnevcanids0 import harvest_cuahsihisnevcanids0 - -@job -def implnet_job_cuahsihisnevcanids0(): - harvest_cuahsihisnevcanids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnewnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnewnids0.py deleted file mode 100644 index 506ea9c6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnewnids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisnewnids0 import harvest_cuahsihisnewnids0 - -@job -def implnet_job_cuahsihisnewnids0(): - harvest_cuahsihisnewnids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnhgswofids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnhgswofids0.py deleted file mode 100644 index 5865499d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnhgswofids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisnhgswofids0 import harvest_cuahsihisnhgswofids0 - -@job -def implnet_job_cuahsihisnhgswofids0(): - harvest_cuahsihisnhgswofids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnooksackmicroclimatenetworkids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnooksackmicroclimatenetworkids0.py deleted file mode 100644 index 81358e29..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisnooksackmicroclimatenetworkids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisnooksackmicroclimatenetworkids0 import harvest_cuahsihisnooksackmicroclimatenetworkids0 - -@job -def implnet_job_cuahsihisnooksackmicroclimatenetworkids0(): - harvest_cuahsihisnooksackmicroclimatenetworkids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisodmkentstateids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisodmkentstateids0.py deleted file mode 100644 index 63a818e9..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisodmkentstateids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisodmkentstateids0 import harvest_cuahsihisodmkentstateids0 - -@job -def implnet_job_cuahsihisodmkentstateids0(): - harvest_cuahsihisodmkentstateids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisorsancohabids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisorsancohabids0.py deleted file mode 100644 index d2aee63d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisorsancohabids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisorsancohabids0 import harvest_cuahsihisorsancohabids0 - -@job -def implnet_job_cuahsihisorsancohabids0(): - harvest_cuahsihisorsancohabids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihispanolaodmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihispanolaodmids0.py deleted file mode 100644 index 73672ec0..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihispanolaodmids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihispanolaodmids0 import harvest_cuahsihispanolaodmids0 - -@job -def implnet_job_cuahsihispanolaodmids0(): - harvest_cuahsihispanolaodmids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisparalanaturalezaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisparalanaturalezaids0.py deleted file mode 100644 index dcfe3c65..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisparalanaturalezaids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisparalanaturalezaids0 import harvest_cuahsihisparalanaturalezaids0 - -@job -def implnet_job_cuahsihisparalanaturalezaids0(): - harvest_cuahsihisparalanaturalezaids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisprovorivergamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisprovorivergamutids0.py deleted file 
mode 100644 index dc931cd4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisprovorivergamutids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisprovorivergamutids0 import harvest_cuahsihisprovorivergamutids0 - -@job -def implnet_job_cuahsihisprovorivergamutids0(): - harvest_cuahsihisprovorivergamutids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisredbuttecreekgamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisredbuttecreekgamutids0.py deleted file mode 100644 index 6fbfc3df..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisredbuttecreekgamutids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisredbuttecreekgamutids0 import harvest_cuahsihisredbuttecreekgamutids0 - -@job -def implnet_job_cuahsihisredbuttecreekgamutids0(): - harvest_cuahsihisredbuttecreekgamutids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisrmblids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisrmblids0.py deleted file mode 100644 index 21a37e3b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisrmblids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisrmblids0 import harvest_cuahsihisrmblids0 - -@job -def implnet_job_cuahsihisrmblids0(): - harvest_cuahsihisrmblids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihissagehencreekids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihissagehencreekids0.py deleted file mode 100644 index 4f71a81a..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihissagehencreekids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihissagehencreekids0 import harvest_cuahsihissagehencreekids0 - -@job -def implnet_job_cuahsihissagehencreekids0(): - harvest_cuahsihissagehencreekids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisscanids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisscanids0.py deleted file mode 100644 index bef484cc..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisscanids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisscanids0 import harvest_cuahsihisscanids0 - -@job -def implnet_job_cuahsihisscanids0(): - harvest_cuahsihisscanids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisscotlandnwisids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisscotlandnwisids0.py deleted file mode 100644 index 9f5605ef..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisscotlandnwisids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisscotlandnwisids0 import harvest_cuahsihisscotlandnwisids0 - -@job -def implnet_job_cuahsihisscotlandnwisids0(): - harvest_cuahsihisscotlandnwisids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisshalenetworkodmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisshalenetworkodmids0.py deleted file mode 100644 index aa868715..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisshalenetworkodmids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from 
ops.implnet_ops_cuahsihisshalenetworkodmids0 import harvest_cuahsihisshalenetworkodmids0 - -@job -def implnet_job_cuahsihisshalenetworkodmids0(): - harvest_cuahsihisshalenetworkodmids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisshalenetworkodmids1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisshalenetworkodmids1.py deleted file mode 100644 index 30acd4bb..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisshalenetworkodmids1.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisshalenetworkodmids1 import harvest_cuahsihisshalenetworkodmids1 - -@job -def implnet_job_cuahsihisshalenetworkodmids1(): - harvest_cuahsihisshalenetworkodmids1() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisskcmilltownids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisskcmilltownids0.py deleted file mode 100644 index 28d62e67..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisskcmilltownids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisskcmilltownids0 import harvest_cuahsihisskcmilltownids0 - -@job -def implnet_job_cuahsihisskcmilltownids0(): - harvest_cuahsihisskcmilltownids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihissnotelids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihissnotelids0.py deleted file mode 100644 index fc5fa17f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihissnotelids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihissnotelids0 import harvest_cuahsihissnotelids0 - -@job -def 
implnet_job_cuahsihissnotelids0(): - harvest_cuahsihissnotelids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisswedishmonitoringdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisswedishmonitoringdataids0.py deleted file mode 100644 index 919c7165..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisswedishmonitoringdataids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisswedishmonitoringdataids0 import harvest_cuahsihisswedishmonitoringdataids0 - -@job -def implnet_job_cuahsihisswedishmonitoringdataids0(): - harvest_cuahsihisswedishmonitoringdataids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistarlandwaterqualityids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistarlandwaterqualityids0.py deleted file mode 100644 index c3d808fd..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistarlandwaterqualityids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihistarlandwaterqualityids0 import harvest_cuahsihistarlandwaterqualityids0 - -@job -def implnet_job_cuahsihistarlandwaterqualityids0(): - harvest_cuahsihistarlandwaterqualityids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistncwaterdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistncwaterdataids0.py deleted file mode 100644 index c0e44f7d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistncwaterdataids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihistncwaterdataids0 import harvest_cuahsihistncwaterdataids0 - -@job -def 
implnet_job_cuahsihistncwaterdataids0(): - harvest_cuahsihistncwaterdataids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistrwaids0.py deleted file mode 100644 index 4989a99f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistrwaids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihistrwaids0 import harvest_cuahsihistrwaids0 - -@job -def implnet_job_cuahsihistrwaids0(): - harvest_cuahsihistrwaids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistuolumnemdwids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistuolumnemdwids0.py deleted file mode 100644 index 91e6881d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihistuolumnemdwids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihistuolumnemdwids0 import harvest_cuahsihistuolumnemdwids0 - -@job -def implnet_job_cuahsihistuolumnemdwids0(): - harvest_cuahsihistuolumnemdwids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisubwpadids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisubwpadids0.py deleted file mode 100644 index 0d786be8..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisubwpadids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisubwpadids0 import harvest_cuahsihisubwpadids0 - -@job -def implnet_job_cuahsihisubwpadids0(): - harvest_cuahsihisubwpadids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisumbcgwids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisumbcgwids0.py deleted file mode 100644 index 7e4113c8..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisumbcgwids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisumbcgwids0 import harvest_cuahsihisumbcgwids0 - -@job -def implnet_job_cuahsihisumbcgwids0(): - harvest_cuahsihisumbcgwids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisumbcwqids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisumbcwqids0.py deleted file mode 100644 index 848f6cbd..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisumbcwqids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisumbcwqids0 import harvest_cuahsihisumbcwqids0 - -@job -def implnet_job_cuahsihisumbcwqids0(): - harvest_cuahsihisumbcwqids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisunhsnowids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisunhsnowids0.py deleted file mode 100644 index 4a2c71f6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisunhsnowids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisunhsnowids0 import harvest_cuahsihisunhsnowids0 - -@job -def implnet_job_cuahsihisunhsnowids0(): - harvest_cuahsihisunhsnowids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisweiherbachids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisweiherbachids0.py deleted file mode 100644 index 364a09d1..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisweiherbachids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisweiherbachids0 import harvest_cuahsihisweiherbachids0 - -@job -def implnet_job_cuahsihisweiherbachids0(): - harvest_cuahsihisweiherbachids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisyosemitehydroclimatenetworkids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisyosemitehydroclimatenetworkids0.py deleted file mode 100644 index f21cfcd0..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_cuahsihisyosemitehydroclimatenetworkids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuahsihisyosemitehydroclimatenetworkids0 import harvest_cuahsihisyosemitehydroclimatenetworkids0 - -@job -def implnet_job_cuahsihisyosemitehydroclimatenetworkids0(): - harvest_cuahsihisyosemitehydroclimatenetworkids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_dams0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_dams0.py deleted file mode 100644 index bf92c53e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_dams0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_dams0 import harvest_dams0 - -@job -def implnet_job_dams0(): - harvest_dams0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_dams1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_dams1.py deleted file mode 100644 index a675c6a0..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_dams1.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_dams1 import harvest_dams1 - -@job -def 
implnet_job_dams1(): - harvest_dams1() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_damspids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_damspids0.py deleted file mode 100644 index f0fed16e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_damspids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_damspids0 import harvest_damspids0 - -@job -def implnet_job_damspids0(): - harvest_damspids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_demo0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_demo0.py deleted file mode 100644 index 51eb0266..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_demo0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_demo0 import harvest_demo0 - -@job -def implnet_job_demo0(): - harvest_demo0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_gfv11pois0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_gfv11pois0.py deleted file mode 100644 index a291992d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_gfv11pois0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_gfv11pois0 import harvest_gfv11pois0 - -@job -def implnet_job_gfv11pois0(): - harvest_gfv11pois0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_gfv11pois1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_gfv11pois1.py deleted file mode 100644 index cba3692f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_gfv11pois1.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from 
ops.implnet_ops_gfv11pois1 import harvest_gfv11pois1 - -@job -def implnet_job_gfv11pois1(): - harvest_gfv11pois1() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hmw0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hmw0.py deleted file mode 100644 index 29d53172..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hmw0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_hmw0 import harvest_hmw0 - -@job -def implnet_job_hmw0(): - harvest_hmw0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hmw1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hmw1.py deleted file mode 100644 index f9d1ad68..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hmw1.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_hmw1 import harvest_hmw1 - -@job -def implnet_job_hmw1(): - harvest_hmw1() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu020.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu020.py deleted file mode 100644 index fcde012d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu020.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_hu020 import harvest_hu020 - -@job -def implnet_job_hu020(): - harvest_hu020() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu040.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu040.py deleted file mode 100644 index 7776e00a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu040.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_hu040 import 
harvest_hu040 - -@job -def implnet_job_hu040(): - harvest_hu040() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu060.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu060.py deleted file mode 100644 index fe4a44a3..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu060.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_hu060 import harvest_hu060 - -@job -def implnet_job_hu060(): - harvest_hu060() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu080.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu080.py deleted file mode 100644 index 412a1b3b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu080.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_hu080 import harvest_hu080 - -@job -def implnet_job_hu080(): - harvest_hu080() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu100.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu100.py deleted file mode 100644 index fa812d46..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hu100.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_hu100 import harvest_hu100 - -@job -def implnet_job_hu100(): - harvest_hu100() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_huc12pp0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_huc12pp0.py deleted file mode 100644 index e498fcad..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_huc12pp0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_huc12pp0 import harvest_huc12pp0 - -@job -def 
implnet_job_huc12pp0(): - harvest_huc12pp0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_huc12pp1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_huc12pp1.py deleted file mode 100644 index 47225877..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_huc12pp1.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_huc12pp1 import harvest_huc12pp1 - -@job -def implnet_job_huc12pp1(): - harvest_huc12pp1() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hydrologicunit0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hydrologicunit0.py deleted file mode 100644 index 626b412d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_hydrologicunit0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_hydrologicunit0 import harvest_hydrologicunit0 - -@job -def implnet_job_hydrologicunit0(): - harvest_hydrologicunit0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_links0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_links0.py deleted file mode 100644 index 90350f96..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_links0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_links0 import harvest_links0 - -@job -def implnet_job_links0(): - harvest_links0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_mainstems0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_mainstems0.py deleted file mode 100644 index 3e6ab4c4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_mainstems0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster 
import job - -from ops.implnet_ops_mainstems0 import harvest_mainstems0 - -@job -def implnet_job_mainstems0(): - harvest_mainstems0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nataq0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nataq0.py deleted file mode 100644 index fcac3e9c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nataq0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nataq0 import harvest_nataq0 - -@job -def implnet_job_nataq0(): - harvest_nataq0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose0.py deleted file mode 100644 index c1ab5b1f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nmwdiose0 import harvest_nmwdiose0 - -@job -def implnet_job_nmwdiose0(): - harvest_nmwdiose0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose1.py deleted file mode 100644 index 736fdf78..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose1.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nmwdiose1 import harvest_nmwdiose1 - -@job -def implnet_job_nmwdiose1(): - harvest_nmwdiose1() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose2.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose2.py deleted file mode 100644 index f2ec56e3..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose2.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nmwdiose2 import harvest_nmwdiose2 - -@job -def implnet_job_nmwdiose2(): - harvest_nmwdiose2() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose3.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose3.py deleted file mode 100644 index f7d1aefd..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose3.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nmwdiose3 import harvest_nmwdiose3 - -@job -def implnet_job_nmwdiose3(): - harvest_nmwdiose3() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose4.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose4.py deleted file mode 100644 index dd1ab409..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdiose4.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nmwdiose4 import harvest_nmwdiose4 - -@job -def implnet_job_nmwdiose4(): - harvest_nmwdiose4() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdist0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdist0.py deleted file mode 100644 index 5a3fb84b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nmwdist0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nmwdist0 import harvest_nmwdist0 - -@job -def implnet_job_nmwdist0(): - harvest_nmwdist0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw0.py 
deleted file mode 100644 index 257aebff..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw0 import harvest_nwisgw0 - -@job -def implnet_job_nwisgw0(): - harvest_nwisgw0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw1.py deleted file mode 100644 index f6af1ef2..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw1.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw1 import harvest_nwisgw1 - -@job -def implnet_job_nwisgw1(): - harvest_nwisgw1() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw10.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw10.py deleted file mode 100644 index d3443419..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw10.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw10 import harvest_nwisgw10 - -@job -def implnet_job_nwisgw10(): - harvest_nwisgw10() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw11.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw11.py deleted file mode 100644 index e294a0df..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw11.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw11 import harvest_nwisgw11 - -@job -def implnet_job_nwisgw11(): - harvest_nwisgw11() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw12.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw12.py deleted file mode 100644 index 2c0cf870..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw12.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw12 import harvest_nwisgw12 - -@job -def implnet_job_nwisgw12(): - harvest_nwisgw12() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw13.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw13.py deleted file mode 100644 index 02dd0892..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw13.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw13 import harvest_nwisgw13 - -@job -def implnet_job_nwisgw13(): - harvest_nwisgw13() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw14.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw14.py deleted file mode 100644 index e0123111..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw14.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw14 import harvest_nwisgw14 - -@job -def implnet_job_nwisgw14(): - harvest_nwisgw14() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw15.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw15.py deleted file mode 100644 index 04d121eb..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw15.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw15 import harvest_nwisgw15 - -@job -def implnet_job_nwisgw15(): - harvest_nwisgw15() \ No newline at end of file diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw16.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw16.py deleted file mode 100644 index 2aafac7d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw16.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw16 import harvest_nwisgw16 - -@job -def implnet_job_nwisgw16(): - harvest_nwisgw16() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw17.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw17.py deleted file mode 100644 index 12533de5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw17.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw17 import harvest_nwisgw17 - -@job -def implnet_job_nwisgw17(): - harvest_nwisgw17() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw18.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw18.py deleted file mode 100644 index 5e04df4f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw18.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw18 import harvest_nwisgw18 - -@job -def implnet_job_nwisgw18(): - harvest_nwisgw18() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw19.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw19.py deleted file mode 100644 index 5481ee6b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw19.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw19 import harvest_nwisgw19 - -@job -def implnet_job_nwisgw19(): - harvest_nwisgw19() \ 
No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw2.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw2.py deleted file mode 100644 index 107cbaf4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw2.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw2 import harvest_nwisgw2 - -@job -def implnet_job_nwisgw2(): - harvest_nwisgw2() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw20.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw20.py deleted file mode 100644 index fc963f46..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw20.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw20 import harvest_nwisgw20 - -@job -def implnet_job_nwisgw20(): - harvest_nwisgw20() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw21.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw21.py deleted file mode 100644 index 8dcdfeb4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw21.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw21 import harvest_nwisgw21 - -@job -def implnet_job_nwisgw21(): - harvest_nwisgw21() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw22.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw22.py deleted file mode 100644 index 6688765c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw22.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw22 import harvest_nwisgw22 - -@job -def 
implnet_job_nwisgw22(): - harvest_nwisgw22() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw23.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw23.py deleted file mode 100644 index 235ea9a2..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw23.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw23 import harvest_nwisgw23 - -@job -def implnet_job_nwisgw23(): - harvest_nwisgw23() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw24.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw24.py deleted file mode 100644 index 4629fa91..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw24.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw24 import harvest_nwisgw24 - -@job -def implnet_job_nwisgw24(): - harvest_nwisgw24() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw25.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw25.py deleted file mode 100644 index dafa0ec6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw25.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw25 import harvest_nwisgw25 - -@job -def implnet_job_nwisgw25(): - harvest_nwisgw25() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw26.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw26.py deleted file mode 100644 index 4492f72f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw26.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw26 
import harvest_nwisgw26 - -@job -def implnet_job_nwisgw26(): - harvest_nwisgw26() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw27.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw27.py deleted file mode 100644 index 84049cfc..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw27.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw27 import harvest_nwisgw27 - -@job -def implnet_job_nwisgw27(): - harvest_nwisgw27() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw28.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw28.py deleted file mode 100644 index e2ea0f05..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw28.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw28 import harvest_nwisgw28 - -@job -def implnet_job_nwisgw28(): - harvest_nwisgw28() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw3.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw3.py deleted file mode 100644 index 416f80a6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw3.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw3 import harvest_nwisgw3 - -@job -def implnet_job_nwisgw3(): - harvest_nwisgw3() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw4.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw4.py deleted file mode 100644 index 95ec7076..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw4.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - 
-from ops.implnet_ops_nwisgw4 import harvest_nwisgw4 - -@job -def implnet_job_nwisgw4(): - harvest_nwisgw4() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw5.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw5.py deleted file mode 100644 index c84cdeae..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw5.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw5 import harvest_nwisgw5 - -@job -def implnet_job_nwisgw5(): - harvest_nwisgw5() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw6.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw6.py deleted file mode 100644 index d9c3d1db..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw6.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw6 import harvest_nwisgw6 - -@job -def implnet_job_nwisgw6(): - harvest_nwisgw6() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw7.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw7.py deleted file mode 100644 index 2a77ef8e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw7.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw7 import harvest_nwisgw7 - -@job -def implnet_job_nwisgw7(): - harvest_nwisgw7() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw8.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw8.py deleted file mode 100644 index 8f8741bd..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw8.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster 
import job - -from ops.implnet_ops_nwisgw8 import harvest_nwisgw8 - -@job -def implnet_job_nwisgw8(): - harvest_nwisgw8() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw9.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw9.py deleted file mode 100644 index 4cd8710a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwisgw9.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwisgw9 import harvest_nwisgw9 - -@job -def implnet_job_nwisgw9(): - harvest_nwisgw9() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite0.py deleted file mode 100644 index 8401b1e0..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwissite0 import harvest_nwissite0 - -@job -def implnet_job_nwissite0(): - harvest_nwissite0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite1.py deleted file mode 100644 index 78009468..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite1.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwissite1 import harvest_nwissite1 - -@job -def implnet_job_nwissite1(): - harvest_nwissite1() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite2.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite2.py deleted file mode 100644 index 8e7c759c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite2.py 
+++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwissite2 import harvest_nwissite2 - -@job -def implnet_job_nwissite2(): - harvest_nwissite2() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite3.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite3.py deleted file mode 100644 index 3106e4b7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_nwissite3.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nwissite3 import harvest_nwissite3 - -@job -def implnet_job_nwissite3(): - harvest_nwissite3() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_places0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_places0.py deleted file mode 100644 index 865efac7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_places0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_places0 import harvest_places0 - -@job -def implnet_job_places0(): - harvest_places0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_princiaq0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_princiaq0.py deleted file mode 100644 index 3dda69ff..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_princiaq0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_princiaq0 import harvest_princiaq0 - -@job -def implnet_job_princiaq0(): - harvest_princiaq0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_pws0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_pws0.py deleted file mode 100644 index 4126696f..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_pws0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_pws0 import harvest_pws0 - -@job -def implnet_job_pws0(): - harvest_pws0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage0.py deleted file mode 100644 index 46607415..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_refgage0 import harvest_refgage0 - -@job -def implnet_job_refgage0(): - harvest_refgage0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage1.py deleted file mode 100644 index 2dcdcab4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage1.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_refgage1 import harvest_refgage1 - -@job -def implnet_job_refgage1(): - harvest_refgage1() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage2.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage2.py deleted file mode 100644 index b9bc1a4c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage2.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_refgage2 import harvest_refgage2 - -@job -def implnet_job_refgage2(): - harvest_refgage2() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage3.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage3.py deleted file mode 100644 index 
870cc1f2..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_refgage3.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_refgage3 import harvest_refgage3 - -@job -def implnet_job_refgage3(): - harvest_refgage3() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_rise0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_rise0.py deleted file mode 100644 index 7f5dd685..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_rise0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_rise0 import harvest_rise0 - -@job -def implnet_job_rise0(): - harvest_rise0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_sechydrgreg0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_sechydrgreg0.py deleted file mode 100644 index 6747ad00..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_sechydrgreg0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_sechydrgreg0 import harvest_sechydrgreg0 - -@job -def implnet_job_sechydrgreg0(): - harvest_sechydrgreg0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_selfieids0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_selfieids0.py deleted file mode 100644 index b4a6da97..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_selfieids0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_selfieids0 import harvest_selfieids0 - -@job -def implnet_job_selfieids0(): - harvest_selfieids0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_states0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_states0.py deleted file mode 100644 index e40f041a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_states0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_states0 import harvest_states0 - -@job -def implnet_job_states0(): - harvest_states0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_ua100.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_ua100.py deleted file mode 100644 index 159f619e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_ua100.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ua100 import harvest_ua100 - -@job -def implnet_job_ua100(): - harvest_ua100() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade0.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade0.py deleted file mode 100644 index d188b950..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade0.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade0 import harvest_wade0 - -@job -def implnet_job_wade0(): - harvest_wade0() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade1.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade1.py deleted file mode 100644 index 725a3a0a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade1.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade1 import harvest_wade1 - -@job -def implnet_job_wade1(): - harvest_wade1() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade10.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade10.py deleted file mode 100644 index fee08238..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade10.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade10 import harvest_wade10 - -@job -def implnet_job_wade10(): - harvest_wade10() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade11.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade11.py deleted file mode 100644 index ba521353..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade11.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade11 import harvest_wade11 - -@job -def implnet_job_wade11(): - harvest_wade11() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade12.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade12.py deleted file mode 100644 index bc4e6108..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade12.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade12 import harvest_wade12 - -@job -def implnet_job_wade12(): - harvest_wade12() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade13.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade13.py deleted file mode 100644 index 291f8c66..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade13.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade13 import harvest_wade13 - -@job -def implnet_job_wade13(): - harvest_wade13() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade14.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade14.py deleted file mode 100644 index 955c68e6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade14.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade14 import harvest_wade14 - -@job -def implnet_job_wade14(): - harvest_wade14() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade15.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade15.py deleted file mode 100644 index 5e982edf..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade15.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade15 import harvest_wade15 - -@job -def implnet_job_wade15(): - harvest_wade15() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade16.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade16.py deleted file mode 100644 index 2f54ea43..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade16.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade16 import harvest_wade16 - -@job -def implnet_job_wade16(): - harvest_wade16() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade17.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade17.py deleted file mode 100644 index 18c747f2..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade17.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade17 import harvest_wade17 - -@job -def implnet_job_wade17(): - harvest_wade17() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade18.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade18.py deleted file mode 100644 index 5bd5ce6f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade18.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade18 import harvest_wade18 - -@job -def implnet_job_wade18(): - harvest_wade18() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade19.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade19.py deleted file mode 100644 index 1641d82a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade19.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade19 import harvest_wade19 - -@job -def implnet_job_wade19(): - harvest_wade19() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade2.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade2.py deleted file mode 100644 index 0fda70a7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade2.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade2 import harvest_wade2 - -@job -def implnet_job_wade2(): - harvest_wade2() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade3.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade3.py deleted file mode 100644 index bbb207e3..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade3.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade3 import harvest_wade3 - -@job -def implnet_job_wade3(): - harvest_wade3() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade4.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade4.py deleted file mode 100644 index 63e92ddb..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade4.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade4 import harvest_wade4 - -@job -def implnet_job_wade4(): - harvest_wade4() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade5.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade5.py deleted file mode 100644 index 74c0c863..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade5.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade5 import harvest_wade5 - -@job -def implnet_job_wade5(): - harvest_wade5() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade6.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade6.py deleted file mode 100644 index 44732ec2..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade6.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade6 import harvest_wade6 - -@job -def implnet_job_wade6(): - harvest_wade6() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade7.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade7.py deleted file mode 100644 index 2825e5da..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade7.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade7 import harvest_wade7 - -@job -def implnet_job_wade7(): - harvest_wade7() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade8.py 
b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade8.py deleted file mode 100644 index ec6a046e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade8.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade8 import harvest_wade8 - -@job -def implnet_job_wade8(): - harvest_wade8() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade9.py b/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade9.py deleted file mode 100644 index ed099b32..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/jobs/implnet_jobs_wade9.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wade9 import harvest_wade9 - -@job -def implnet_job_wade9(): - harvest_wade9() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_aiannh0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_aiannh0.py deleted file mode 100644 index 8ef7b64a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_aiannh0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, 
OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def aiannh0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def aiannh0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "aiannh0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def aiannh0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "aiannh0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def aiannh0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "aiannh0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def aiannh0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "aiannh0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def aiannh0_naburelease(context): - returned_value = gleanerio(context,("release"), "aiannh0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def aiannh0_uploadrelease(context): - returned_value = postRelease("aiannh0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
aiannh0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="aiannh0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "aiannh0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def aiannh0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="aiannh0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "aiannh0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def aiannh0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="aiannh0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"aiannh0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def aiannh0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="aiannh0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "aiannh0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def aiannh0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "aiannh0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="aiannh0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="aiannh0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_aiannh0(): - containers = aiannh0_getImage() - harvest = aiannh0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = aiannh0_missingreport_s3(start=harvest) - report_idstat = aiannh0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = aiannh0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="aiannh0") - load_release = aiannh0_naburelease(start=harvest) - load_uploadrelease = aiannh0_uploadrelease(start=load_release) - - load_prune = aiannh0_nabu_prune(start=load_uploadrelease) - load_prov = aiannh0_nabuprov(start=load_prune) - load_org = aiannh0_nabuorg(start=load_prov) - -# run after load - report_msgraph=aiannh0_missingreport_graph(start=load_org) - report_graph=aiannh0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_autotest10.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_autotest10.py deleted file mode 100644 index 6d0e34d6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_autotest10.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io 
-import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def autotest10_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def autotest10_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "autotest10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def autotest10_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "autotest10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def autotest10_nabuprov(context): - returned_value = gleanerio(context,("prov"), "autotest10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def autotest10_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "autotest10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def autotest10_naburelease(context): - returned_value = gleanerio(context,("release"), "autotest10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def autotest10_uploadrelease(context): - returned_value = postRelease("autotest10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - 
-@op(ins={"start": In(Nothing)}) -def autotest10_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest10") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "autotest10" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def autotest10_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest10") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "autotest10" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def autotest10_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest10") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "autotest10" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def autotest10_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest10") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "autotest10" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def autotest10_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "autotest10" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. 
Then import these methods? -# def missingreport_s3(context, msg: str, source="autotest10"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="autotest10" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_autotest10(): - containers = autotest10_getImage() - harvest = autotest10_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = autotest10_missingreport_s3(start=harvest) - report_idstat = autotest10_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = autotest10_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="autotest10") - load_release = autotest10_naburelease(start=harvest) - load_uploadrelease = autotest10_uploadrelease(start=load_release) - - load_prune = autotest10_nabu_prune(start=load_uploadrelease) - load_prov = autotest10_nabuprov(start=load_prune) - load_org = autotest10_nabuorg(start=load_prov) - -# run after load - report_msgraph=autotest10_missingreport_graph(start=load_org) - report_graph=autotest10_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_autotest20.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_autotest20.py deleted file mode 100644 index 52b05a70..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_autotest20.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster 
import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def autotest20_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def autotest20_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "autotest20") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def autotest20_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "autotest20") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def autotest20_nabuprov(context): - returned_value = gleanerio(context,("prov"), "autotest20") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def autotest20_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "autotest20") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def autotest20_naburelease(context): - returned_value = gleanerio(context,("release"), "autotest20") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def autotest20_uploadrelease(context): - returned_value = postRelease("autotest20") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - 
-@op(ins={"start": In(Nothing)}) -def autotest20_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest20") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "autotest20" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def autotest20_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest20") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "autotest20" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def autotest20_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest20") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "autotest20" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def autotest20_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="autotest20") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "autotest20" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def autotest20_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "autotest20" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. 
Then import these methods? -# def missingreport_s3(context, msg: str, source="autotest20"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="autotest20" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_autotest20(): - containers = autotest20_getImage() - harvest = autotest20_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = autotest20_missingreport_s3(start=harvest) - report_idstat = autotest20_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = autotest20_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="autotest20") - load_release = autotest20_naburelease(start=harvest) - load_uploadrelease = autotest20_uploadrelease(start=load_release) - - load_prune = autotest20_nabu_prune(start=load_uploadrelease) - load_prov = autotest20_nabuprov(start=load_prune) - load_org = autotest20_nabuorg(start=load_prov) - -# run after load - report_msgraph=autotest20_missingreport_graph(start=load_org) - report_graph=autotest20_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cagagespids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cagagespids0.py deleted file mode 100644 index e8b52d7f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cagagespids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from 
dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cagagespids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cagagespids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cagagespids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cagagespids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cagagespids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cagagespids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cagagespids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cagagespids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cagagespids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cagagespids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cagagespids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cagagespids0_uploadrelease(context): - returned_value = postRelease("cagagespids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") 
- return - - -@op(ins={"start": In(Nothing)}) -def cagagespids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cagagespids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cagagespids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cagagespids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cagagespids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cagagespids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cagagespids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cagagespids0") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cagagespids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cagagespids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cagagespids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cagagespids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cagagespids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cagagespids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. 
Then import these methods? -# def missingreport_s3(context, msg: str, source="cagagespids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cagagespids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cagagespids0(): - containers = cagagespids0_getImage() - harvest = cagagespids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cagagespids0_missingreport_s3(start=harvest) - report_idstat = cagagespids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cagagespids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cagagespids0") - load_release = cagagespids0_naburelease(start=harvest) - load_uploadrelease = cagagespids0_uploadrelease(start=load_release) - - load_prune = cagagespids0_nabu_prune(start=load_uploadrelease) - load_prov = cagagespids0_nabuprov(start=load_prune) - load_org = cagagespids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cagagespids0_missingreport_graph(start=load_org) - report_graph=cagagespids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cbsa0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cbsa0.py deleted file mode 100644 index fd71d2dc..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cbsa0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import 
time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) 
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - 
return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cbsa0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cbsa0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cbsa0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cbsa0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cbsa0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cbsa0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cbsa0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cbsa0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cbsa0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cbsa0_naburelease(context): - returned_value = gleanerio(context,("release"), "cbsa0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cbsa0_uploadrelease(context): - returned_value = postRelease("cbsa0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cbsa0_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cbsa0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cbsa0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cbsa0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cbsa0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cbsa0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cbsa0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cbsa0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cbsa0" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cbsa0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cbsa0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cbsa0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cbsa0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cbsa0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="cbsa0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cbsa0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cbsa0(): - containers = cbsa0_getImage() - harvest = cbsa0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cbsa0_missingreport_s3(start=harvest) - report_idstat = cbsa0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cbsa0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cbsa0") - load_release = cbsa0_naburelease(start=harvest) - load_uploadrelease = cbsa0_uploadrelease(start=load_release) - - load_prune = cbsa0_nabu_prune(start=load_uploadrelease) - load_prov = cbsa0_nabuprov(start=load_prune) - load_org = cbsa0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cbsa0_missingreport_graph(start=load_org) - report_graph=cbsa0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_chyldpilotids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_chyldpilotids0.py deleted file mode 100644 index b22c1575..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_chyldpilotids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib 
-from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def chyldpilotids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def chyldpilotids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "chyldpilotids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def chyldpilotids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "chyldpilotids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def chyldpilotids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "chyldpilotids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def chyldpilotids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "chyldpilotids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def chyldpilotids0_naburelease(context): - returned_value = gleanerio(context,("release"), "chyldpilotids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def chyldpilotids0_uploadrelease(context): - returned_value = postRelease("chyldpilotids0") - r = str('returned value:{}'.format(returned_value)) - 
get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def chyldpilotids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="chyldpilotids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "chyldpilotids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def chyldpilotids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="chyldpilotids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "chyldpilotids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def chyldpilotids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, 
sourcename="chyldpilotids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "chyldpilotids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def chyldpilotids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="chyldpilotids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "chyldpilotids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def chyldpilotids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "chyldpilotids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls 
report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="chyldpilotids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="chyldpilotids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_chyldpilotids0(): - containers = chyldpilotids0_getImage() - harvest = chyldpilotids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = chyldpilotids0_missingreport_s3(start=harvest) - report_idstat = chyldpilotids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = chyldpilotids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="chyldpilotids0") - load_release = chyldpilotids0_naburelease(start=harvest) - load_uploadrelease = chyldpilotids0_uploadrelease(start=load_release) - - load_prune = chyldpilotids0_nabu_prune(start=load_uploadrelease) - load_prov = chyldpilotids0_nabuprov(start=load_prune) - load_org = chyldpilotids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=chyldpilotids0_missingreport_graph(start=load_org) - report_graph=chyldpilotids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_counties0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_counties0.py deleted file mode 100644 index af125a85..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_counties0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = 
bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = 
"s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I think this happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def counties0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def counties0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "counties0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def counties0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "counties0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def counties0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "counties0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def counties0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "counties0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def counties0_naburelease(context): - returned_value = gleanerio(context,("release"), "counties0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def counties0_uploadrelease(context): - returned_value = postRelease("counties0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def counties0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="counties0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "counties0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def counties0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="counties0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "counties0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def counties0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="counties0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "counties0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def counties0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="counties0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "counties0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def counties0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "counties0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="counties0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="counties0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_counties0(): - containers = counties0_getImage() - harvest = counties0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = counties0_missingreport_s3(start=harvest) - report_idstat = counties0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = counties0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="counties0") - load_release = counties0_naburelease(start=harvest) - load_uploadrelease = counties0_uploadrelease(start=load_release) - - load_prune = counties0_nabu_prune(start=load_uploadrelease) - load_prov = counties0_nabuprov(start=load_prune) - load_org = counties0_nabuorg(start=load_prov) - -# run after load - report_msgraph=counties0_missingreport_graph(start=load_org) - report_graph=counties0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisandrewsforestlterids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisandrewsforestlterids0.py deleted file mode 100644 index c64c8a7a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisandrewsforestlterids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import 
time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) 
-GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - 
return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisandrewsforestlterids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisandrewsforestlterids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisandrewsforestlterids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisandrewsforestlterids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisandrewsforestlterids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisandrewsforestlterids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisandrewsforestlterids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisandrewsforestlterids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisandrewsforestlterids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisandrewsforestlterids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisandrewsforestlterids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
cuahsihisandrewsforestlterids0_uploadrelease(context): - returned_value = postRelease("cuahsihisandrewsforestlterids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisandrewsforestlterids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisandrewsforestlterids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisandrewsforestlterids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisandrewsforestlterids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisandrewsforestlterids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisandrewsforestlterids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, 
"missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisandrewsforestlterids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisandrewsforestlterids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisandrewsforestlterids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisandrewsforestlterids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisandrewsforestlterids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisandrewsforestlterids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisandrewsforestlterids0_bucket_urls(context): - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisandrewsforestlterids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisandrewsforestlterids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisandrewsforestlterids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisandrewsforestlterids0(): - containers = cuahsihisandrewsforestlterids0_getImage() - harvest = cuahsihisandrewsforestlterids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisandrewsforestlterids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisandrewsforestlterids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisandrewsforestlterids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisandrewsforestlterids0") - load_release = cuahsihisandrewsforestlterids0_naburelease(start=harvest) - load_uploadrelease = 
cuahsihisandrewsforestlterids0_uploadrelease(start=load_release) - - load_prune = cuahsihisandrewsforestlterids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisandrewsforestlterids0_nabuprov(start=load_prune) - load_org = cuahsihisandrewsforestlterids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisandrewsforestlterids0_missingreport_graph(start=load_org) - report_graph=cuahsihisandrewsforestlterids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisbrazilucbids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisbrazilucbids0.py deleted file mode 100644 index 29a7b92e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisbrazilucbids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, 
validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisbrazilucbids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisbrazilucbids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisbrazilucbids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisbrazilucbids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisbrazilucbids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisbrazilucbids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisbrazilucbids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisbrazilucbids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisbrazilucbids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisbrazilucbids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisbrazilucbids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisbrazilucbids0_uploadrelease(context): - returned_value = 
postRelease("cuahsihisbrazilucbids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisbrazilucbids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisbrazilucbids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisbrazilucbids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisbrazilucbids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisbrazilucbids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisbrazilucbids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) 
-def cuahsihisbrazilucbids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisbrazilucbids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisbrazilucbids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisbrazilucbids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisbrazilucbids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisbrazilucbids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisbrazilucbids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisbrazilucbids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned 
value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisbrazilucbids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisbrazilucbids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisbrazilucbids0(): - containers = cuahsihisbrazilucbids0_getImage() - harvest = cuahsihisbrazilucbids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisbrazilucbids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisbrazilucbids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisbrazilucbids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisbrazilucbids0") - load_release = cuahsihisbrazilucbids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisbrazilucbids0_uploadrelease(start=load_release) - - load_prune = cuahsihisbrazilucbids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisbrazilucbids0_nabuprov(start=load_prune) - load_org = cuahsihisbrazilucbids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisbrazilucbids0_missingreport_graph(start=load_org) - 
report_graph=cuahsihisbrazilucbids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscalvinhhsids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscalvinhhsids0.py deleted file mode 100644 index 338a9356..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscalvinhhsids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs 
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = 
_pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No 
Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = 
client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - 
ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # 
v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - 
enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihiscalvinhhsids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihiscalvinhhsids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihiscalvinhhsids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscalvinhhsids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihiscalvinhhsids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscalvinhhsids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihiscalvinhhsids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscalvinhhsids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihiscalvinhhsids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihiscalvinhhsids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihiscalvinhhsids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscalvinhhsids0_uploadrelease(context): - returned_value = postRelease("cuahsihiscalvinhhsids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihiscalvinhhsids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscalvinhhsids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscalvinhhsids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscalvinhhsids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscalvinhhsids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscalvinhhsids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, 
graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscalvinhhsids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscalvinhhsids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscalvinhhsids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscalvinhhsids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscalvinhhsids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscalvinhhsids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats 
report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscalvinhhsids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscalvinhhsids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihiscalvinhhsids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihiscalvinhhsids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihiscalvinhhsids0(): - containers = cuahsihiscalvinhhsids0_getImage() - harvest = cuahsihiscalvinhhsids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihiscalvinhhsids0_missingreport_s3(start=harvest) - report_idstat = cuahsihiscalvinhhsids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihiscalvinhhsids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihiscalvinhhsids0") - load_release = cuahsihiscalvinhhsids0_naburelease(start=harvest) - load_uploadrelease = 
cuahsihiscalvinhhsids0_uploadrelease(start=load_release) - - load_prune = cuahsihiscalvinhhsids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihiscalvinhhsids0_nabuprov(start=load_prune) - load_org = cuahsihiscalvinhhsids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihiscalvinhhsids0_missingreport_graph(start=load_org) - report_graph=cuahsihiscalvinhhsids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisccbepdapids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisccbepdapids0.py deleted file mode 100644 index 97fbc12b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisccbepdapids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services 
import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisccbepdapids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisccbepdapids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisccbepdapids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisccbepdapids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisccbepdapids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisccbepdapids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisccbepdapids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisccbepdapids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisccbepdapids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisccbepdapids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisccbepdapids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisccbepdapids0_uploadrelease(context): - returned_value = 
postRelease("cuahsihisccbepdapids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisccbepdapids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisccbepdapids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisccbepdapids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisccbepdapids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisccbepdapids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisccbepdapids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
cuahsihisccbepdapids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisccbepdapids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisccbepdapids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisccbepdapids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisccbepdapids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisccbepdapids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisccbepdapids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisccbepdapids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - 
bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisccbepdapids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisccbepdapids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisccbepdapids0(): - containers = cuahsihisccbepdapids0_getImage() - harvest = cuahsihisccbepdapids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisccbepdapids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisccbepdapids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisccbepdapids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisccbepdapids0") - load_release = cuahsihisccbepdapids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisccbepdapids0_uploadrelease(start=load_release) - - load_prune = cuahsihisccbepdapids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisccbepdapids0_nabuprov(start=load_prune) - load_org = cuahsihisccbepdapids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisccbepdapids0_missingreport_graph(start=load_org) - 
report_graph=cuahsihisccbepdapids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscedarriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscedarriverids0.py deleted file mode 100644 index 1fc61da6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscedarriverids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs 
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = 
_pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No 
Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = 
client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - 
ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # 
v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - 
enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihiscedarriverids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihiscedarriverids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihiscedarriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscedarriverids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihiscedarriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscedarriverids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihiscedarriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscedarriverids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihiscedarriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihiscedarriverids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihiscedarriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscedarriverids0_uploadrelease(context): - returned_value = postRelease("cuahsihiscedarriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihiscedarriverids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscedarriverids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscedarriverids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscedarriverids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscedarriverids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscedarriverids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, 
graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscedarriverids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscedarriverids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscedarriverids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscedarriverids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscedarriverids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscedarriverids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer 
stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscedarriverids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscedarriverids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihiscedarriverids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihiscedarriverids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihiscedarriverids0(): - containers = cuahsihiscedarriverids0_getImage() - harvest = cuahsihiscedarriverids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihiscedarriverids0_missingreport_s3(start=harvest) - report_idstat = cuahsihiscedarriverids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihiscedarriverids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihiscedarriverids0") - load_release = cuahsihiscedarriverids0_naburelease(start=harvest) - 
load_uploadrelease = cuahsihiscedarriverids0_uploadrelease(start=load_release) - - load_prune = cuahsihiscedarriverids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihiscedarriverids0_nabuprov(start=load_prune) - load_org = cuahsihiscedarriverids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihiscedarriverids0_missingreport_graph(start=load_org) - report_graph=cuahsihiscedarriverids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisclarksburgspids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisclarksburgspids0.py deleted file mode 100644 index 4460f887..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisclarksburgspids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, 
validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisclarksburgspids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisclarksburgspids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisclarksburgspids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisclarksburgspids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisclarksburgspids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisclarksburgspids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisclarksburgspids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisclarksburgspids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisclarksburgspids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisclarksburgspids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisclarksburgspids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisclarksburgspids0_uploadrelease(context): - 
returned_value = postRelease("cuahsihisclarksburgspids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisclarksburgspids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisclarksburgspids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisclarksburgspids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisclarksburgspids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisclarksburgspids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisclarksburgspids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - 
return -@op(ins={"start": In(Nothing)}) -def cuahsihisclarksburgspids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisclarksburgspids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisclarksburgspids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisclarksburgspids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisclarksburgspids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisclarksburgspids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisclarksburgspids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisclarksburgspids0" - - res = 
s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisclarksburgspids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisclarksburgspids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisclarksburgspids0(): - containers = cuahsihisclarksburgspids0_getImage() - harvest = cuahsihisclarksburgspids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisclarksburgspids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisclarksburgspids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisclarksburgspids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisclarksburgspids0") - load_release = cuahsihisclarksburgspids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisclarksburgspids0_uploadrelease(start=load_release) - - load_prune = cuahsihisclarksburgspids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisclarksburgspids0_nabuprov(start=load_prune) - load_org = 
cuahsihisclarksburgspids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisclarksburgspids0_missingreport_graph(start=load_org) - report_graph=cuahsihisclarksburgspids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscocorahsids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscocorahsids0.py deleted file mode 100644 index de371636..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscocorahsids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker 
compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif 
(GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in 
r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id 
{str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == 
"prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // 
v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihiscocorahsids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihiscocorahsids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihiscocorahsids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscocorahsids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihiscocorahsids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscocorahsids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihiscocorahsids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscocorahsids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihiscocorahsids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihiscocorahsids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihiscocorahsids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscocorahsids0_uploadrelease(context): - returned_value = postRelease("cuahsihiscocorahsids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihiscocorahsids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscocorahsids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscocorahsids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscocorahsids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscocorahsids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscocorahsids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, 
milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscocorahsids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscocorahsids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscocorahsids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscocorahsids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscocorahsids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscocorahsids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") 
- return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscocorahsids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscocorahsids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihiscocorahsids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihiscocorahsids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihiscocorahsids0(): - containers = cuahsihiscocorahsids0_getImage() - harvest = cuahsihiscocorahsids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihiscocorahsids0_missingreport_s3(start=harvest) - report_idstat = cuahsihiscocorahsids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihiscocorahsids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihiscocorahsids0") - load_release = cuahsihiscocorahsids0_naburelease(start=harvest) - load_uploadrelease = 
cuahsihiscocorahsids0_uploadrelease(start=load_release) - - load_prune = cuahsihiscocorahsids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihiscocorahsids0_nabuprov(start=load_prune) - load_org = cuahsihiscocorahsids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihiscocorahsids0_missingreport_graph(start=load_org) - report_graph=cuahsihiscocorahsids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscrwaids0.py deleted file mode 100644 index c99b9036..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscrwaids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, 
TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihiscrwaids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihiscrwaids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihiscrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscrwaids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihiscrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscrwaids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihiscrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscrwaids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihiscrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscrwaids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihiscrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscrwaids0_uploadrelease(context): - returned_value = postRelease("cuahsihiscrwaids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihiscrwaids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscrwaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscrwaids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscrwaids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscrwaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscrwaids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscrwaids0_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscrwaids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscrwaids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscrwaids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscrwaids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscrwaids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscrwaids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscrwaids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, 
source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihiscrwaids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihiscrwaids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihiscrwaids0(): - containers = cuahsihiscrwaids0_getImage() - harvest = cuahsihiscrwaids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihiscrwaids0_missingreport_s3(start=harvest) - report_idstat = cuahsihiscrwaids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihiscrwaids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihiscrwaids0") - load_release = cuahsihiscrwaids0_naburelease(start=harvest) - load_uploadrelease = cuahsihiscrwaids0_uploadrelease(start=load_release) - - load_prune = cuahsihiscrwaids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihiscrwaids0_nabuprov(start=load_prune) - load_org = cuahsihiscrwaids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihiscrwaids0_missingreport_graph(start=load_org) - report_graph=cuahsihiscrwaids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscuisoids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscuisoids0.py deleted file mode 100644 index 44eee490..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiscuisoids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = 
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = 
f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = 
bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: 
DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = 
ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log 
file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", 
"S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - 
data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihiscuisoids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihiscuisoids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihiscuisoids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscuisoids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihiscuisoids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscuisoids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihiscuisoids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscuisoids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihiscuisoids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscuisoids0_naburelease(context): - 
returned_value = gleanerio(context,("release"), "cuahsihiscuisoids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscuisoids0_uploadrelease(context): - returned_value = postRelease("cuahsihiscuisoids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihiscuisoids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscuisoids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscuisoids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscuisoids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscuisoids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscuisoids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned 
value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiscuisoids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscuisoids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscuisoids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiscuisoids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiscuisoids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscuisoids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihiscuisoids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiscuisoids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihiscuisoids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihiscuisoids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihiscuisoids0(): - containers = cuahsihiscuisoids0_getImage() - harvest = cuahsihiscuisoids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihiscuisoids0_missingreport_s3(start=harvest) - report_idstat = cuahsihiscuisoids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihiscuisoids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihiscuisoids0") - load_release = cuahsihiscuisoids0_naburelease(start=harvest) - load_uploadrelease = cuahsihiscuisoids0_uploadrelease(start=load_release) - - load_prune = cuahsihiscuisoids0_nabu_prune(start=load_uploadrelease) - 
load_prov = cuahsihiscuisoids0_nabuprov(start=load_prune) - load_org = cuahsihiscuisoids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihiscuisoids0_missingreport_graph(start=load_org) - report_graph=cuahsihiscuisoids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoarizids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoarizids0.py deleted file mode 100644 index aab39321..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoarizids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in 
docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 
'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = 
_pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code 
== 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - 
get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used 
for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", 
"MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisczoarizids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisczoarizids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisczoarizids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoarizids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisczoarizids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoarizids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisczoarizids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoarizids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisczoarizids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisczoarizids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisczoarizids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczoarizids0_uploadrelease(context): - returned_value = postRelease("cuahsihisczoarizids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoarizids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoarizids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoarizids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczoarizids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoarizids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoarizids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, 
summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczoarizids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoarizids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoarizids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoarizids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoarizids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoarizids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - 
-@op(ins={"start": In(Nothing)}) -def cuahsihisczoarizids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoarizids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisczoarizids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisczoarizids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisczoarizids0(): - containers = cuahsihisczoarizids0_getImage() - harvest = cuahsihisczoarizids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisczoarizids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisczoarizids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisczoarizids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisczoarizids0") - load_release = cuahsihisczoarizids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisczoarizids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihisczoarizids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisczoarizids0_nabuprov(start=load_prune) - load_org = cuahsihisczoarizids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisczoarizids0_missingreport_graph(start=load_org) - report_graph=cuahsihisczoarizids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoboulderids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoboulderids0.py deleted file mode 100644 index 30b03230..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoboulderids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 
'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisczoboulderids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisczoboulderids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisczoboulderids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoboulderids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisczoboulderids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoboulderids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisczoboulderids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoboulderids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisczoboulderids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoboulderids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisczoboulderids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczoboulderids0_uploadrelease(context): - returned_value = 
postRelease("cuahsihisczoboulderids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoboulderids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoboulderids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoboulderids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczoboulderids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoboulderids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoboulderids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": 
In(Nothing)}) -def cuahsihisczoboulderids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoboulderids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoboulderids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoboulderids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoboulderids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoboulderids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoboulderids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoboulderids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = 
str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisczoboulderids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisczoboulderids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisczoboulderids0(): - containers = cuahsihisczoboulderids0_getImage() - harvest = cuahsihisczoboulderids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisczoboulderids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisczoboulderids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisczoboulderids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisczoboulderids0") - load_release = cuahsihisczoboulderids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisczoboulderids0_uploadrelease(start=load_release) - - load_prune = cuahsihisczoboulderids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisczoboulderids0_nabuprov(start=load_prune) - load_org = cuahsihisczoboulderids0_nabuorg(start=load_prov) - -# run after load - 
report_msgraph=cuahsihisczoboulderids0_missingreport_graph(start=load_org) - report_graph=cuahsihisczoboulderids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczocatalinaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczocatalinaids0.py deleted file mode 100644 index 584b550d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczocatalinaids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose 
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif 
(GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in 
r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id 
{str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == 
"prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // 
v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisczocatalinaids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisczocatalinaids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisczocatalinaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczocatalinaids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisczocatalinaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczocatalinaids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisczocatalinaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczocatalinaids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisczocatalinaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisczocatalinaids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisczocatalinaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczocatalinaids0_uploadrelease(context): - returned_value = postRelease("cuahsihisczocatalinaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisczocatalinaids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczocatalinaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczocatalinaids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczocatalinaids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczocatalinaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczocatalinaids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, 
s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczocatalinaids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczocatalinaids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczocatalinaids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczocatalinaids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczocatalinaids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczocatalinaids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - 
get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczocatalinaids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczocatalinaids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisczocatalinaids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisczocatalinaids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisczocatalinaids0(): - containers = cuahsihisczocatalinaids0_getImage() - harvest = cuahsihisczocatalinaids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisczocatalinaids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisczocatalinaids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisczocatalinaids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisczocatalinaids0") - load_release = 
cuahsihisczocatalinaids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisczocatalinaids0_uploadrelease(start=load_release) - - load_prune = cuahsihisczocatalinaids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisczocatalinaids0_nabuprov(start=load_prune) - load_org = cuahsihisczocatalinaids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisczocatalinaids0_missingreport_graph(start=load_org) - report_graph=cuahsihisczocatalinaids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoluquilloids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoluquilloids0.py deleted file mode 100644 index efc746e3..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoluquilloids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from 
dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisczoluquilloids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisczoluquilloids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisczoluquilloids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoluquilloids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisczoluquilloids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoluquilloids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisczoluquilloids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoluquilloids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisczoluquilloids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoluquilloids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisczoluquilloids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczoluquilloids0_uploadrelease(context): - returned_value = 
postRelease("cuahsihisczoluquilloids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoluquilloids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoluquilloids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoluquilloids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczoluquilloids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoluquilloids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoluquilloids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": 
In(Nothing)}) -def cuahsihisczoluquilloids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoluquilloids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoluquilloids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoluquilloids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoluquilloids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoluquilloids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoluquilloids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoluquilloids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = 
str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisczoluquilloids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisczoluquilloids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisczoluquilloids0(): - containers = cuahsihisczoluquilloids0_getImage() - harvest = cuahsihisczoluquilloids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisczoluquilloids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisczoluquilloids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisczoluquilloids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisczoluquilloids0") - load_release = cuahsihisczoluquilloids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisczoluquilloids0_uploadrelease(start=load_release) - - load_prune = cuahsihisczoluquilloids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisczoluquilloids0_nabuprov(start=load_prune) - load_org = cuahsihisczoluquilloids0_nabuorg(start=load_prov) - -# run after load - 
report_msgraph=cuahsihisczoluquilloids0_missingreport_graph(start=load_org) - report_graph=cuahsihisczoluquilloids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczomercedids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczomercedids0.py deleted file mode 100644 index 7246c604..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczomercedids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', 
"dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = 
_pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No 
Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = 
client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - 
ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # 
v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - 
enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisczomercedids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisczomercedids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisczomercedids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczomercedids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisczomercedids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczomercedids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisczomercedids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczomercedids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisczomercedids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisczomercedids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisczomercedids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczomercedids0_uploadrelease(context): - returned_value = postRelease("cuahsihisczomercedids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisczomercedids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczomercedids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczomercedids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczomercedids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczomercedids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczomercedids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, 
graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczomercedids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczomercedids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczomercedids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczomercedids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczomercedids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczomercedids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats 
report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczomercedids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczomercedids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisczomercedids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisczomercedids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisczomercedids0(): - containers = cuahsihisczomercedids0_getImage() - harvest = cuahsihisczomercedids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisczomercedids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisczomercedids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisczomercedids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisczomercedids0") - load_release = cuahsihisczomercedids0_naburelease(start=harvest) - load_uploadrelease = 
cuahsihisczomercedids0_uploadrelease(start=load_release) - - load_prune = cuahsihisczomercedids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisczomercedids0_nabuprov(start=load_prune) - load_org = cuahsihisczomercedids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisczomercedids0_missingreport_graph(start=load_org) - report_graph=cuahsihisczomercedids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczopsuids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczopsuids0.py deleted file mode 100644 index 7d419530..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczopsuids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import 
ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisczopsuids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisczopsuids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisczopsuids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczopsuids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisczopsuids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczopsuids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisczopsuids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczopsuids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisczopsuids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczopsuids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisczopsuids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczopsuids0_uploadrelease(context): - returned_value = postRelease("cuahsihisczopsuids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisczopsuids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczopsuids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczopsuids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczopsuids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczopsuids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczopsuids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczopsuids0_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczopsuids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczopsuids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczopsuids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczopsuids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczopsuids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczopsuids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczopsuids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - 
s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisczopsuids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisczopsuids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisczopsuids0(): - containers = cuahsihisczopsuids0_getImage() - harvest = cuahsihisczopsuids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisczopsuids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisczopsuids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisczopsuids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisczopsuids0") - load_release = cuahsihisczopsuids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisczopsuids0_uploadrelease(start=load_release) - - load_prune = cuahsihisczopsuids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisczopsuids0_nabuprov(start=load_prune) - load_org = cuahsihisczopsuids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisczopsuids0_missingreport_graph(start=load_org) - report_graph=cuahsihisczopsuids0_graph_reports(start=report_msgraph) - - - - diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoudelids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoudelids0.py deleted file mode 100644 index b9ee4947..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisczoudelids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = 
os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) 
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = 
f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') 
- raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id 
{str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" 
- WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", 
"SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - 
enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisczoudelids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisczoudelids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisczoudelids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoudelids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisczoudelids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoudelids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisczoudelids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoudelids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisczoudelids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisczoudelids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisczoudelids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczoudelids0_uploadrelease(context): - returned_value = postRelease("cuahsihisczoudelids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoudelids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoudelids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoudelids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczoudelids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoudelids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoudelids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, 
summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisczoudelids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoudelids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoudelids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisczoudelids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisczoudelids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoudelids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - 
-@op(ins={"start": In(Nothing)}) -def cuahsihisczoudelids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisczoudelids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisczoudelids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisczoudelids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisczoudelids0(): - containers = cuahsihisczoudelids0_getImage() - harvest = cuahsihisczoudelids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisczoudelids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisczoudelids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisczoudelids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisczoudelids0") - load_release = cuahsihisczoudelids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisczoudelids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihisczoudelids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisczoudelids0_nabuprov(start=load_prune) - load_org = cuahsihisczoudelids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisczoudelids0_missingreport_graph(start=load_org) - report_graph=cuahsihisczoudelids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisdrwiids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisdrwiids0.py deleted file mode 100644 index 7813edac..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisdrwiids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 
'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisdrwiids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisdrwiids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisdrwiids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisdrwiids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisdrwiids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisdrwiids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisdrwiids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisdrwiids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisdrwiids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisdrwiids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisdrwiids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisdrwiids0_uploadrelease(context): - returned_value = postRelease("cuahsihisdrwiids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisdrwiids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisdrwiids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisdrwiids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisdrwiids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisdrwiids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisdrwiids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisdrwiids0_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisdrwiids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisdrwiids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisdrwiids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisdrwiids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisdrwiids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisdrwiids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisdrwiids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, 
source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisdrwiids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisdrwiids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisdrwiids0(): - containers = cuahsihisdrwiids0_getImage() - harvest = cuahsihisdrwiids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisdrwiids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisdrwiids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisdrwiids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisdrwiids0") - load_release = cuahsihisdrwiids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisdrwiids0_uploadrelease(start=load_release) - - load_prune = cuahsihisdrwiids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisdrwiids0_nabuprov(start=load_prune) - load_org = cuahsihisdrwiids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisdrwiids0_missingreport_graph(start=load_org) - report_graph=cuahsihisdrwiids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfarmrwaids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfarmrwaids0.py deleted file mode 100644 index 920612b5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfarmrwaids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = 
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = 
f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = 
bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: 
DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = 
ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log 
file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", 
"S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - 
data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisfarmrwaids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisfarmrwaids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisfarmrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfarmrwaids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisfarmrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfarmrwaids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisfarmrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfarmrwaids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisfarmrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisfarmrwaids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisfarmrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisfarmrwaids0_uploadrelease(context): - returned_value = postRelease("cuahsihisfarmrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisfarmrwaids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfarmrwaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfarmrwaids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisfarmrwaids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfarmrwaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfarmrwaids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, 
summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisfarmrwaids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfarmrwaids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfarmrwaids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfarmrwaids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfarmrwaids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfarmrwaids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - 
-@op(ins={"start": In(Nothing)}) -def cuahsihisfarmrwaids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfarmrwaids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisfarmrwaids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisfarmrwaids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisfarmrwaids0(): - containers = cuahsihisfarmrwaids0_getImage() - harvest = cuahsihisfarmrwaids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisfarmrwaids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisfarmrwaids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisfarmrwaids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisfarmrwaids0") - load_release = cuahsihisfarmrwaids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisfarmrwaids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihisfarmrwaids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisfarmrwaids0_nabuprov(start=load_prune) - load_org = cuahsihisfarmrwaids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisfarmrwaids0_missingreport_graph(start=load_org) - report_graph=cuahsihisfarmrwaids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfcelterids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfcelterids0.py deleted file mode 100644 index c192b4e5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfcelterids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 
'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisfcelterids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisfcelterids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisfcelterids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfcelterids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisfcelterids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfcelterids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisfcelterids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfcelterids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisfcelterids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfcelterids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisfcelterids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisfcelterids0_uploadrelease(context): - returned_value = postRelease("cuahsihisfcelterids0") - r = 
str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisfcelterids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfcelterids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfcelterids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisfcelterids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfcelterids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfcelterids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisfcelterids0_graph_reports(context) : - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfcelterids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfcelterids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfcelterids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfcelterids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfcelterids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfcelterids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfcelterids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - 
s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisfcelterids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisfcelterids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisfcelterids0(): - containers = cuahsihisfcelterids0_getImage() - harvest = cuahsihisfcelterids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisfcelterids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisfcelterids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisfcelterids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisfcelterids0") - load_release = cuahsihisfcelterids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisfcelterids0_uploadrelease(start=load_release) - - load_prune = cuahsihisfcelterids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisfcelterids0_nabuprov(start=load_prune) - load_org = cuahsihisfcelterids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisfcelterids0_missingreport_graph(start=load_org) - report_graph=cuahsihisfcelterids0_graph_reports(start=report_msgraph) - - - - diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfrcwqmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfrcwqmids0.py deleted file mode 100644 index 5529ed68..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisfrcwqmids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = 
os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) 
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = 
f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') 
- raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id 
{str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" 
- WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", 
"SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - 
enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisfrcwqmids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisfrcwqmids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisfrcwqmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfrcwqmids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisfrcwqmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfrcwqmids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisfrcwqmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfrcwqmids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisfrcwqmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfrcwqmids0_naburelease(context): - 
returned_value = gleanerio(context,("release"), "cuahsihisfrcwqmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisfrcwqmids0_uploadrelease(context): - returned_value = postRelease("cuahsihisfrcwqmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisfrcwqmids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfrcwqmids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfrcwqmids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisfrcwqmids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfrcwqmids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfrcwqmids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned 
value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisfrcwqmids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfrcwqmids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfrcwqmids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisfrcwqmids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisfrcwqmids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfrcwqmids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisfrcwqmids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisfrcwqmids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisfrcwqmids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisfrcwqmids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisfrcwqmids0(): - containers = cuahsihisfrcwqmids0_getImage() - harvest = cuahsihisfrcwqmids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisfrcwqmids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisfrcwqmids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisfrcwqmids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisfrcwqmids0") - load_release = cuahsihisfrcwqmids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisfrcwqmids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihisfrcwqmids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisfrcwqmids0_nabuprov(start=load_prune) - load_org = cuahsihisfrcwqmids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisfrcwqmids0_missingreport_graph(start=load_org) - report_graph=cuahsihisfrcwqmids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisghcnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisghcnids0.py deleted file mode 100644 index 8d042b85..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisghcnids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 
'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisghcnids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisghcnids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisghcnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisghcnids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisghcnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisghcnids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisghcnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisghcnids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisghcnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisghcnids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisghcnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisghcnids0_uploadrelease(context): - returned_value = postRelease("cuahsihisghcnids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisghcnids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisghcnids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisghcnids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisghcnids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisghcnids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisghcnids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisghcnids0_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisghcnids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisghcnids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisghcnids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisghcnids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisghcnids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisghcnids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisghcnids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, 
source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisghcnids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisghcnids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisghcnids0(): - containers = cuahsihisghcnids0_getImage() - harvest = cuahsihisghcnids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisghcnids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisghcnids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisghcnids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisghcnids0") - load_release = cuahsihisghcnids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisghcnids0_uploadrelease(start=load_release) - - load_prune = cuahsihisghcnids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisghcnids0_nabuprov(start=load_prune) - load_org = cuahsihisghcnids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisghcnids0_missingreport_graph(start=load_org) - report_graph=cuahsihisghcnids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisglacialridgeids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisglacialridgeids0.py deleted file mode 100644 index a5966474..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisglacialridgeids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = 
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = 
f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = 
bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: 
DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = 
ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log 
file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", 
"S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - 
data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisglacialridgeids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisglacialridgeids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisglacialridgeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisglacialridgeids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisglacialridgeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisglacialridgeids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisglacialridgeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisglacialridgeids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisglacialridgeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": 
In(Nothing)}) -def cuahsihisglacialridgeids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisglacialridgeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisglacialridgeids0_uploadrelease(context): - returned_value = postRelease("cuahsihisglacialridgeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisglacialridgeids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglacialridgeids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisglacialridgeids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisglacialridgeids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglacialridgeids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisglacialridgeids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = 
missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisglacialridgeids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglacialridgeids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisglacialridgeids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisglacialridgeids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglacialridgeids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisglacialridgeids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, 
"identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisglacialridgeids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisglacialridgeids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisglacialridgeids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisglacialridgeids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisglacialridgeids0(): - containers = cuahsihisglacialridgeids0_getImage() - harvest = cuahsihisglacialridgeids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisglacialridgeids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisglacialridgeids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisglacialridgeids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, 
source="cuahsihisglacialridgeids0") - load_release = cuahsihisglacialridgeids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisglacialridgeids0_uploadrelease(start=load_release) - - load_prune = cuahsihisglacialridgeids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisglacialridgeids0_nabuprov(start=load_prune) - load_org = cuahsihisglacialridgeids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisglacialridgeids0_missingreport_graph(start=load_org) - report_graph=cuahsihisglacialridgeids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonauburnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonauburnids0.py deleted file mode 100644 index 418482c3..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonauburnids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from 
dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisgleonauburnids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonauburnids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisgleonauburnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonauburnids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisgleonauburnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonauburnids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisgleonauburnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonauburnids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisgleonauburnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonauburnids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisgleonauburnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonauburnids0_uploadrelease(context): - returned_value = 
postRelease("cuahsihisgleonauburnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonauburnids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonauburnids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonauburnids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonauburnids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonauburnids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonauburnids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": 
In(Nothing)}) -def cuahsihisgleonauburnids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonauburnids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonauburnids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonauburnids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonauburnids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonauburnids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonauburnids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonauburnids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = 
str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisgleonauburnids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisgleonauburnids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisgleonauburnids0(): - containers = cuahsihisgleonauburnids0_getImage() - harvest = cuahsihisgleonauburnids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisgleonauburnids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisgleonauburnids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisgleonauburnids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisgleonauburnids0") - load_release = cuahsihisgleonauburnids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisgleonauburnids0_uploadrelease(start=load_release) - - load_prune = cuahsihisgleonauburnids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisgleonauburnids0_nabuprov(start=load_prune) - load_org = cuahsihisgleonauburnids0_nabuorg(start=load_prov) - -# run after load - 
report_msgraph=cuahsihisgleonauburnids0_missingreport_graph(start=load_org) - report_graph=cuahsihisgleonauburnids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleondorsetids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleondorsetids0.py deleted file mode 100644 index 88ce2d7b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleondorsetids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose 
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif 
(GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in 
r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id 
{str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == 
"prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // 
v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisgleondorsetids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisgleondorsetids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisgleondorsetids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleondorsetids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisgleondorsetids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleondorsetids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisgleondorsetids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleondorsetids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisgleondorsetids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisgleondorsetids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisgleondorsetids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisgleondorsetids0_uploadrelease(context): - returned_value = postRelease("cuahsihisgleondorsetids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleondorsetids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleondorsetids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleondorsetids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisgleondorsetids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleondorsetids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleondorsetids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, 
s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisgleondorsetids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleondorsetids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleondorsetids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleondorsetids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleondorsetids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleondorsetids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - 
get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleondorsetids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleondorsetids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisgleondorsetids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisgleondorsetids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisgleondorsetids0(): - containers = cuahsihisgleondorsetids0_getImage() - harvest = cuahsihisgleondorsetids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisgleondorsetids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisgleondorsetids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisgleondorsetids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisgleondorsetids0") - load_release = 
cuahsihisgleondorsetids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisgleondorsetids0_uploadrelease(start=load_release) - - load_prune = cuahsihisgleondorsetids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisgleondorsetids0_nabuprov(start=load_prune) - load_org = cuahsihisgleondorsetids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisgleondorsetids0_missingreport_graph(start=load_org) - report_graph=cuahsihisgleondorsetids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonlakeannieids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonlakeannieids0.py deleted file mode 100644 index 13ec911c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonlakeannieids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from 
dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisgleonlakeannieids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonlakeannieids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisgleonlakeannieids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonlakeannieids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisgleonlakeannieids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonlakeannieids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisgleonlakeannieids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonlakeannieids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisgleonlakeannieids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonlakeannieids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisgleonlakeannieids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
cuahsihisgleonlakeannieids0_uploadrelease(context): - returned_value = postRelease("cuahsihisgleonlakeannieids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonlakeannieids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonlakeannieids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonlakeannieids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonlakeannieids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonlakeannieids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonlakeannieids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - 
get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonlakeannieids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonlakeannieids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonlakeannieids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonlakeannieids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonlakeannieids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonlakeannieids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonlakeannieids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonlakeannieids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisgleonlakeannieids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisgleonlakeannieids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisgleonlakeannieids0(): - containers = cuahsihisgleonlakeannieids0_getImage() - harvest = cuahsihisgleonlakeannieids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisgleonlakeannieids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisgleonlakeannieids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisgleonlakeannieids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisgleonlakeannieids0") - load_release = cuahsihisgleonlakeannieids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisgleonlakeannieids0_uploadrelease(start=load_release) - - load_prune = cuahsihisgleonlakeannieids0_nabu_prune(start=load_uploadrelease) - load_prov = 
cuahsihisgleonlakeannieids0_nabuprov(start=load_prune) - load_org = cuahsihisgleonlakeannieids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisgleonlakeannieids0_missingreport_graph(start=load_org) - report_graph=cuahsihisgleonlakeannieids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonsunapeeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonsunapeeids0.py deleted file mode 100644 index 1560b6da..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgleonsunapeeids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume 
and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisgleonsunapeeids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonsunapeeids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisgleonsunapeeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonsunapeeids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisgleonsunapeeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonsunapeeids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisgleonsunapeeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonsunapeeids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisgleonsunapeeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonsunapeeids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisgleonsunapeeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonsunapeeids0_uploadrelease(context): - 
returned_value = postRelease("cuahsihisgleonsunapeeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonsunapeeids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonsunapeeids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonsunapeeids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonsunapeeids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonsunapeeids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonsunapeeids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - 
return -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonsunapeeids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonsunapeeids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonsunapeeids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonsunapeeids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgleonsunapeeids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonsunapeeids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgleonsunapeeids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgleonsunapeeids0" - - res = 
s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisgleonsunapeeids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisgleonsunapeeids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisgleonsunapeeids0(): - containers = cuahsihisgleonsunapeeids0_getImage() - harvest = cuahsihisgleonsunapeeids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisgleonsunapeeids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisgleonsunapeeids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisgleonsunapeeids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisgleonsunapeeids0") - load_release = cuahsihisgleonsunapeeids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisgleonsunapeeids0_uploadrelease(start=load_release) - - load_prune = cuahsihisgleonsunapeeids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisgleonsunapeeids0_nabuprov(start=load_prune) - load_org = 
cuahsihisgleonsunapeeids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisgleonsunapeeids0_missingreport_graph(start=load_org) - report_graph=cuahsihisgleonsunapeeids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisglobalriversobservatoryids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisglobalriversobservatoryids0.py deleted file mode 100644 index f4654481..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisglobalriversobservatoryids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, 
and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 
'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = 
_pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code 
== 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - 
get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used 
for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", 
"MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisglobalriversobservatoryids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisglobalriversobservatoryids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisglobalriversobservatoryids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisglobalriversobservatoryids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisglobalriversobservatoryids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisglobalriversobservatoryids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisglobalriversobservatoryids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisglobalriversobservatoryids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisglobalriversobservatoryids0") - r = str('returned value:{}'.format(returned_value)) - 
get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisglobalriversobservatoryids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisglobalriversobservatoryids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisglobalriversobservatoryids0_uploadrelease(context): - returned_value = postRelease("cuahsihisglobalriversobservatoryids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisglobalriversobservatoryids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglobalriversobservatoryids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisglobalriversobservatoryids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisglobalriversobservatoryids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglobalriversobservatoryids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisglobalriversobservatoryids0" - - graphendpoint = 
_graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisglobalriversobservatoryids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglobalriversobservatoryids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisglobalriversobservatoryids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisglobalriversobservatoryids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisglobalriversobservatoryids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisglobalriversobservatoryids0" - - returned_value = 
generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisglobalriversobservatoryids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisglobalriversobservatoryids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="cuahsihisglobalriversobservatoryids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisglobalriversobservatoryids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisglobalriversobservatoryids0(): - containers = cuahsihisglobalriversobservatoryids0_getImage() - harvest = cuahsihisglobalriversobservatoryids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisglobalriversobservatoryids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisglobalriversobservatoryids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisglobalriversobservatoryids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisglobalriversobservatoryids0") - load_release = cuahsihisglobalriversobservatoryids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisglobalriversobservatoryids0_uploadrelease(start=load_release) - - load_prune = cuahsihisglobalriversobservatoryids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisglobalriversobservatoryids0_nabuprov(start=load_prune) - load_org = cuahsihisglobalriversobservatoryids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisglobalriversobservatoryids0_missingreport_graph(start=load_org) - report_graph=cuahsihisglobalriversobservatoryids0_graph_reports(start=report_msgraph) - - - - 
diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgonggaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgonggaids0.py deleted file mode 100644 index 1b78b0e7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisgonggaids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = 
os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) 
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = 
f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') 
- raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id 
{str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" 
- WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", 
"SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - 
enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisgonggaids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisgonggaids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisgonggaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgonggaids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisgonggaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgonggaids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisgonggaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgonggaids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisgonggaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgonggaids0_naburelease(context): - 
returned_value = gleanerio(context,("release"), "cuahsihisgonggaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisgonggaids0_uploadrelease(context): - returned_value = postRelease("cuahsihisgonggaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisgonggaids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgonggaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgonggaids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisgonggaids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgonggaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgonggaids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned 
value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisgonggaids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgonggaids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgonggaids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisgonggaids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisgonggaids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgonggaids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisgonggaids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisgonggaids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisgonggaids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisgonggaids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisgonggaids0(): - containers = cuahsihisgonggaids0_getImage() - harvest = cuahsihisgonggaids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisgonggaids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisgonggaids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisgonggaids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisgonggaids0") - load_release = cuahsihisgonggaids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisgonggaids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihisgonggaids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisgonggaids0_nabuprov(start=load_prune) - load_org = cuahsihisgonggaids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisgonggaids0_missingreport_graph(start=load_org) - report_graph=cuahsihisgonggaids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishassbergeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishassbergeids0.py deleted file mode 100644 index 5ac06488..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishassbergeids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 
'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihishassbergeids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihishassbergeids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihishassbergeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishassbergeids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihishassbergeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishassbergeids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihishassbergeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishassbergeids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihishassbergeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishassbergeids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihishassbergeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihishassbergeids0_uploadrelease(context): - returned_value = 
postRelease("cuahsihishassbergeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihishassbergeids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishassbergeids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishassbergeids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihishassbergeids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishassbergeids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishassbergeids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) 
-def cuahsihishassbergeids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishassbergeids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishassbergeids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishassbergeids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishassbergeids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishassbergeids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishassbergeids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishassbergeids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned 
value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihishassbergeids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihishassbergeids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihishassbergeids0(): - containers = cuahsihishassbergeids0_getImage() - harvest = cuahsihishassbergeids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihishassbergeids0_missingreport_s3(start=harvest) - report_idstat = cuahsihishassbergeids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihishassbergeids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihishassbergeids0") - load_release = cuahsihishassbergeids0_naburelease(start=harvest) - load_uploadrelease = cuahsihishassbergeids0_uploadrelease(start=load_release) - - load_prune = cuahsihishassbergeids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihishassbergeids0_nabuprov(start=load_prune) - load_org = cuahsihishassbergeids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihishassbergeids0_missingreport_graph(start=load_org) - 
report_graph=cuahsihishassbergeids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishydrodataczdids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishydrodataczdids0.py deleted file mode 100644 index a3c669b4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishydrodataczdids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs 
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = 
_pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No 
Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = 
client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - 
ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # 
v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - 
enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihishydrodataczdids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczdids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihishydrodataczdids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczdids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihishydrodataczdids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczdids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihishydrodataczdids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczdids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihishydrodataczdids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": 
In(Nothing)}) -def cuahsihishydrodataczdids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihishydrodataczdids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczdids0_uploadrelease(context): - returned_value = postRelease("cuahsihishydrodataczdids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczdids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczdids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishydrodataczdids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczdids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczdids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishydrodataczdids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = 
missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczdids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczdids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishydrodataczdids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczdids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczdids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishydrodataczdids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, 
"identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczdids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishydrodataczdids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihishydrodataczdids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihishydrodataczdids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihishydrodataczdids0(): - containers = cuahsihishydrodataczdids0_getImage() - harvest = cuahsihishydrodataczdids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihishydrodataczdids0_missingreport_s3(start=harvest) - report_idstat = cuahsihishydrodataczdids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihishydrodataczdids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, 
source="cuahsihishydrodataczdids0") - load_release = cuahsihishydrodataczdids0_naburelease(start=harvest) - load_uploadrelease = cuahsihishydrodataczdids0_uploadrelease(start=load_release) - - load_prune = cuahsihishydrodataczdids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihishydrodataczdids0_nabuprov(start=load_prune) - load_org = cuahsihishydrodataczdids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihishydrodataczdids0_missingreport_graph(start=load_org) - report_graph=cuahsihishydrodataczdids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishydrodataczhrids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishydrodataczhrids0.py deleted file mode 100644 index 0b18da56..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihishydrodataczhrids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from 
dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihishydrodataczhrids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczhrids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihishydrodataczhrids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczhrids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihishydrodataczhrids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczhrids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihishydrodataczhrids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczhrids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihishydrodataczhrids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczhrids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihishydrodataczhrids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
cuahsihishydrodataczhrids0_uploadrelease(context): - returned_value = postRelease("cuahsihishydrodataczhrids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczhrids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczhrids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishydrodataczhrids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczhrids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczhrids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishydrodataczhrids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - 
get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczhrids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczhrids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishydrodataczhrids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczhrids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihishydrodataczhrids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihishydrodataczhrids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihishydrodataczhrids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "cuahsihishydrodataczhrids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihishydrodataczhrids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihishydrodataczhrids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihishydrodataczhrids0(): - containers = cuahsihishydrodataczhrids0_getImage() - harvest = cuahsihishydrodataczhrids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihishydrodataczhrids0_missingreport_s3(start=harvest) - report_idstat = cuahsihishydrodataczhrids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihishydrodataczhrids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihishydrodataczhrids0") - load_release = cuahsihishydrodataczhrids0_naburelease(start=harvest) - load_uploadrelease = cuahsihishydrodataczhrids0_uploadrelease(start=load_release) - - load_prune = cuahsihishydrodataczhrids0_nabu_prune(start=load_uploadrelease) - load_prov = 
cuahsihishydrodataczhrids0_nabuprov(start=load_prune) - load_org = cuahsihishydrodataczhrids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihishydrodataczhrids0_missingreport_graph(start=load_org) - report_graph=cuahsihishydrodataczhrids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisieeratwilkesuniversityids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisieeratwilkesuniversityids0.py deleted file mode 100644 index fbbf3b93..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisieeratwilkesuniversityids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 
'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisieeratwilkesuniversityids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisieeratwilkesuniversityids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisieeratwilkesuniversityids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisieeratwilkesuniversityids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisieeratwilkesuniversityids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisieeratwilkesuniversityids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisieeratwilkesuniversityids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisieeratwilkesuniversityids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisieeratwilkesuniversityids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisieeratwilkesuniversityids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisieeratwilkesuniversityids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned 
{r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisieeratwilkesuniversityids0_uploadrelease(context): - returned_value = postRelease("cuahsihisieeratwilkesuniversityids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisieeratwilkesuniversityids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisieeratwilkesuniversityids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisieeratwilkesuniversityids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisieeratwilkesuniversityids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisieeratwilkesuniversityids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisieeratwilkesuniversityids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = 
json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisieeratwilkesuniversityids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisieeratwilkesuniversityids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisieeratwilkesuniversityids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisieeratwilkesuniversityids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisieeratwilkesuniversityids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisieeratwilkesuniversityids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": 
In(Nothing)}) -def cuahsihisieeratwilkesuniversityids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisieeratwilkesuniversityids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisieeratwilkesuniversityids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisieeratwilkesuniversityids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisieeratwilkesuniversityids0(): - containers = cuahsihisieeratwilkesuniversityids0_getImage() - harvest = cuahsihisieeratwilkesuniversityids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisieeratwilkesuniversityids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisieeratwilkesuniversityids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisieeratwilkesuniversityids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisieeratwilkesuniversityids0") - load_release 
= cuahsihisieeratwilkesuniversityids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisieeratwilkesuniversityids0_uploadrelease(start=load_release) - - load_prune = cuahsihisieeratwilkesuniversityids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisieeratwilkesuniversityids0_nabuprov(start=load_prune) - load_org = cuahsihisieeratwilkesuniversityids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisieeratwilkesuniversityids0_missingreport_graph(start=load_org) - report_graph=cuahsihisieeratwilkesuniversityids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisirwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisirwaids0.py deleted file mode 100644 index 0fb7e4b2..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisirwaids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from 
dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisirwaids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisirwaids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisirwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisirwaids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisirwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisirwaids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisirwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisirwaids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisirwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisirwaids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisirwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisirwaids0_uploadrelease(context): - returned_value = postRelease("cuahsihisirwaids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisirwaids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisirwaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisirwaids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisirwaids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisirwaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisirwaids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisirwaids0_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisirwaids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisirwaids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisirwaids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisirwaids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisirwaids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisirwaids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisirwaids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, 
source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisirwaids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisirwaids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisirwaids0(): - containers = cuahsihisirwaids0_getImage() - harvest = cuahsihisirwaids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisirwaids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisirwaids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisirwaids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisirwaids0") - load_release = cuahsihisirwaids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisirwaids0_uploadrelease(start=load_release) - - load_prune = cuahsihisirwaids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisirwaids0_nabuprov(start=load_prune) - load_org = cuahsihisirwaids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisirwaids0_missingreport_graph(start=load_org) - report_graph=cuahsihisirwaids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisisbenaids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisisbenaids0.py deleted file mode 100644 index e58f4e18..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisisbenaids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = 
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = 
f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = 
bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: 
DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = 
ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log 
file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", 
"S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - 
data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisisbenaids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisisbenaids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisisbenaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisisbenaids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisisbenaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisisbenaids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisisbenaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisisbenaids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisisbenaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisisbenaids0_naburelease(context): - 
returned_value = gleanerio(context,("release"), "cuahsihisisbenaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisisbenaids0_uploadrelease(context): - returned_value = postRelease("cuahsihisisbenaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisisbenaids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisisbenaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisisbenaids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisisbenaids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisisbenaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisisbenaids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned 
value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisisbenaids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisisbenaids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisisbenaids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisisbenaids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisisbenaids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisisbenaids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisisbenaids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisisbenaids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisisbenaids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisisbenaids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisisbenaids0(): - containers = cuahsihisisbenaids0_getImage() - harvest = cuahsihisisbenaids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisisbenaids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisisbenaids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisisbenaids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisisbenaids0") - load_release = cuahsihisisbenaids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisisbenaids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihisisbenaids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisisbenaids0_nabuprov(start=load_prune) - load_org = cuahsihisisbenaids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisisbenaids0_missingreport_graph(start=load_org) - report_graph=cuahsihisisbenaids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiskansasweatherdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiskansasweatherdataids0.py deleted file mode 100644 index 5974a395..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihiskansasweatherdataids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - 
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihiskansasweatherdataids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihiskansasweatherdataids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihiskansasweatherdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiskansasweatherdataids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihiskansasweatherdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiskansasweatherdataids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihiskansasweatherdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiskansasweatherdataids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihiskansasweatherdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiskansasweatherdataids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihiskansasweatherdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
cuahsihiskansasweatherdataids0_uploadrelease(context): - returned_value = postRelease("cuahsihiskansasweatherdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihiskansasweatherdataids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiskansasweatherdataids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiskansasweatherdataids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiskansasweatherdataids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiskansasweatherdataids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiskansasweatherdataids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, 
"missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihiskansasweatherdataids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiskansasweatherdataids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiskansasweatherdataids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiskansasweatherdataids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihiskansasweatherdataids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiskansasweatherdataids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihiskansasweatherdataids0_bucket_urls(context): - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihiskansasweatherdataids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihiskansasweatherdataids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihiskansasweatherdataids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihiskansasweatherdataids0(): - containers = cuahsihiskansasweatherdataids0_getImage() - harvest = cuahsihiskansasweatherdataids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihiskansasweatherdataids0_missingreport_s3(start=harvest) - report_idstat = cuahsihiskansasweatherdataids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihiskansasweatherdataids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihiskansasweatherdataids0") - load_release = cuahsihiskansasweatherdataids0_naburelease(start=harvest) - load_uploadrelease = 
cuahsihiskansasweatherdataids0_uploadrelease(start=load_release) - - load_prune = cuahsihiskansasweatherdataids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihiskansasweatherdataids0_nabuprov(start=load_prune) - load_org = cuahsihiskansasweatherdataids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihiskansasweatherdataids0_missingreport_graph(start=load_org) - report_graph=cuahsihiskansasweatherdataids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislaselvastreamdischargeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislaselvastreamdischargeids0.py deleted file mode 100644 index a54be7f1..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislaselvastreamdischargeids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from 
dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihislaselvastreamdischargeids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihislaselvastreamdischargeids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihislaselvastreamdischargeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislaselvastreamdischargeids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihislaselvastreamdischargeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislaselvastreamdischargeids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihislaselvastreamdischargeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislaselvastreamdischargeids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihislaselvastreamdischargeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislaselvastreamdischargeids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihislaselvastreamdischargeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned 
{r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihislaselvastreamdischargeids0_uploadrelease(context): - returned_value = postRelease("cuahsihislaselvastreamdischargeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihislaselvastreamdischargeids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislaselvastreamdischargeids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislaselvastreamdischargeids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihislaselvastreamdischargeids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislaselvastreamdischargeids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislaselvastreamdischargeids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = 
json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihislaselvastreamdischargeids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislaselvastreamdischargeids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislaselvastreamdischargeids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislaselvastreamdischargeids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislaselvastreamdischargeids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislaselvastreamdischargeids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": 
In(Nothing)}) -def cuahsihislaselvastreamdischargeids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislaselvastreamdischargeids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihislaselvastreamdischargeids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihislaselvastreamdischargeids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihislaselvastreamdischargeids0(): - containers = cuahsihislaselvastreamdischargeids0_getImage() - harvest = cuahsihislaselvastreamdischargeids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihislaselvastreamdischargeids0_missingreport_s3(start=harvest) - report_idstat = cuahsihislaselvastreamdischargeids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihislaselvastreamdischargeids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihislaselvastreamdischargeids0") - load_release 
= cuahsihislaselvastreamdischargeids0_naburelease(start=harvest) - load_uploadrelease = cuahsihislaselvastreamdischargeids0_uploadrelease(start=load_release) - - load_prune = cuahsihislaselvastreamdischargeids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihislaselvastreamdischargeids0_nabuprov(start=load_prune) - load_org = cuahsihislaselvastreamdischargeids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihislaselvastreamdischargeids0_missingreport_graph(start=load_org) - report_graph=cuahsihislaselvastreamdischargeids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislczoodm2ids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislczoodm2ids0.py deleted file mode 100644 index c744b8b5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislczoodm2ids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from 
dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihislczoodm2ids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihislczoodm2ids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihislczoodm2ids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislczoodm2ids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihislczoodm2ids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislczoodm2ids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihislczoodm2ids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislczoodm2ids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihislczoodm2ids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislczoodm2ids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihislczoodm2ids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihislczoodm2ids0_uploadrelease(context): - returned_value = 
postRelease("cuahsihislczoodm2ids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihislczoodm2ids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislczoodm2ids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislczoodm2ids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihislczoodm2ids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislczoodm2ids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislczoodm2ids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
cuahsihislczoodm2ids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislczoodm2ids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislczoodm2ids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislczoodm2ids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislczoodm2ids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislczoodm2ids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislczoodm2ids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislczoodm2ids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - 
bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihislczoodm2ids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihislczoodm2ids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihislczoodm2ids0(): - containers = cuahsihislczoodm2ids0_getImage() - harvest = cuahsihislczoodm2ids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihislczoodm2ids0_missingreport_s3(start=harvest) - report_idstat = cuahsihislczoodm2ids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihislczoodm2ids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihislczoodm2ids0") - load_release = cuahsihislczoodm2ids0_naburelease(start=harvest) - load_uploadrelease = cuahsihislczoodm2ids0_uploadrelease(start=load_release) - - load_prune = cuahsihislczoodm2ids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihislczoodm2ids0_nabuprov(start=load_prune) - load_org = cuahsihislczoodm2ids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihislczoodm2ids0_missingreport_graph(start=load_org) - 
report_graph=cuahsihislczoodm2ids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislittlebearriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislittlebearriverids0.py deleted file mode 100644 index 68aead74..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislittlebearriverids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs 
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = 
_pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No 
Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = 
client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - 
ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # 
v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - 
enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihislittlebearriverids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihislittlebearriverids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihislittlebearriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislittlebearriverids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihislittlebearriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislittlebearriverids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihislittlebearriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislittlebearriverids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihislittlebearriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - 
-@op(ins={"start": In(Nothing)}) -def cuahsihislittlebearriverids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihislittlebearriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihislittlebearriverids0_uploadrelease(context): - returned_value = postRelease("cuahsihislittlebearriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihislittlebearriverids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislittlebearriverids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislittlebearriverids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihislittlebearriverids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislittlebearriverids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislittlebearriverids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon 
only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihislittlebearriverids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislittlebearriverids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislittlebearriverids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislittlebearriverids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislittlebearriverids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislittlebearriverids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - 
s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislittlebearriverids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislittlebearriverids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihislittlebearriverids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihislittlebearriverids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihislittlebearriverids0(): - containers = cuahsihislittlebearriverids0_getImage() - harvest = cuahsihislittlebearriverids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihislittlebearriverids0_missingreport_s3(start=harvest) - report_idstat = cuahsihislittlebearriverids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = 
cuahsihislittlebearriverids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihislittlebearriverids0") - load_release = cuahsihislittlebearriverids0_naburelease(start=harvest) - load_uploadrelease = cuahsihislittlebearriverids0_uploadrelease(start=load_release) - - load_prune = cuahsihislittlebearriverids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihislittlebearriverids0_nabuprov(start=load_prune) - load_org = cuahsihislittlebearriverids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihislittlebearriverids0_missingreport_graph(start=load_org) - report_graph=cuahsihislittlebearriverids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisloganrivergamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisloganrivergamutids0.py deleted file mode 100644 index 75145e02..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisloganrivergamutids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import 
parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisloganrivergamutids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisloganrivergamutids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisloganrivergamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganrivergamutids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisloganrivergamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganrivergamutids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisloganrivergamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganrivergamutids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisloganrivergamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganrivergamutids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisloganrivergamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
cuahsihisloganrivergamutids0_uploadrelease(context): - returned_value = postRelease("cuahsihisloganrivergamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganrivergamutids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganrivergamutids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisloganrivergamutids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisloganrivergamutids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganrivergamutids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisloganrivergamutids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", 
report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisloganrivergamutids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganrivergamutids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisloganrivergamutids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganrivergamutids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganrivergamutids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisloganrivergamutids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganrivergamutids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "cuahsihisloganrivergamutids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisloganrivergamutids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisloganrivergamutids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisloganrivergamutids0(): - containers = cuahsihisloganrivergamutids0_getImage() - harvest = cuahsihisloganrivergamutids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisloganrivergamutids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisloganrivergamutids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisloganrivergamutids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisloganrivergamutids0") - load_release = cuahsihisloganrivergamutids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisloganrivergamutids0_uploadrelease(start=load_release) - - load_prune = cuahsihisloganrivergamutids0_nabu_prune(start=load_uploadrelease) - 
load_prov = cuahsihisloganrivergamutids0_nabuprov(start=load_prune) - load_org = cuahsihisloganrivergamutids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisloganrivergamutids0_missingreport_graph(start=load_org) - report_graph=cuahsihisloganrivergamutids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisloganriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisloganriverids0.py deleted file mode 100644 index cb43eeef..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisloganriverids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# 
volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisloganriverids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisloganriverids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisloganriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganriverids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisloganriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganriverids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisloganriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganriverids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisloganriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganriverids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisloganriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisloganriverids0_uploadrelease(context): - returned_value = 
postRelease("cuahsihisloganriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganriverids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganriverids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisloganriverids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisloganriverids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganriverids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisloganriverids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": 
In(Nothing)}) -def cuahsihisloganriverids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganriverids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisloganriverids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganriverids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisloganriverids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisloganriverids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisloganriverids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisloganriverids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = 
str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisloganriverids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisloganriverids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisloganriverids0(): - containers = cuahsihisloganriverids0_getImage() - harvest = cuahsihisloganriverids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisloganriverids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisloganriverids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisloganriverids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisloganriverids0") - load_release = cuahsihisloganriverids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisloganriverids0_uploadrelease(start=load_release) - - load_prune = cuahsihisloganriverids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisloganriverids0_nabuprov(start=load_prune) - load_org = cuahsihisloganriverids0_nabuorg(start=load_prov) - -# run after load - 
report_msgraph=cuahsihisloganriverids0_missingreport_graph(start=load_org) - report_graph=cuahsihisloganriverids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislterntlwoodruffids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislterntlwoodruffids0.py deleted file mode 100644 index 2c4810a4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihislterntlwoodruffids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose 
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif 
(GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in 
r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id 
{str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == 
"prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // 
v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihislterntlwoodruffids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihislterntlwoodruffids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihislterntlwoodruffids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislterntlwoodruffids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihislterntlwoodruffids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislterntlwoodruffids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihislterntlwoodruffids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislterntlwoodruffids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihislterntlwoodruffids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - 
-@op(ins={"start": In(Nothing)}) -def cuahsihislterntlwoodruffids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihislterntlwoodruffids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihislterntlwoodruffids0_uploadrelease(context): - returned_value = postRelease("cuahsihislterntlwoodruffids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihislterntlwoodruffids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislterntlwoodruffids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislterntlwoodruffids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihislterntlwoodruffids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislterntlwoodruffids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislterntlwoodruffids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon 
only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihislterntlwoodruffids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislterntlwoodruffids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislterntlwoodruffids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislterntlwoodruffids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihislterntlwoodruffids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislterntlwoodruffids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - 
s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihislterntlwoodruffids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihislterntlwoodruffids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihislterntlwoodruffids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihislterntlwoodruffids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihislterntlwoodruffids0(): - containers = cuahsihislterntlwoodruffids0_getImage() - harvest = cuahsihislterntlwoodruffids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihislterntlwoodruffids0_missingreport_s3(start=harvest) - report_idstat = cuahsihislterntlwoodruffids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = 
cuahsihislterntlwoodruffids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihislterntlwoodruffids0") - load_release = cuahsihislterntlwoodruffids0_naburelease(start=harvest) - load_uploadrelease = cuahsihislterntlwoodruffids0_uploadrelease(start=load_release) - - load_prune = cuahsihislterntlwoodruffids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihislterntlwoodruffids0_nabuprov(start=load_prune) - load_org = cuahsihislterntlwoodruffids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihislterntlwoodruffids0_missingreport_graph(start=load_org) - report_graph=cuahsihislterntlwoodruffids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisluwlids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisluwlids0.py deleted file mode 100644 index 44cdaa25..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisluwlids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils 
import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisluwlids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisluwlids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisluwlids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisluwlids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisluwlids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisluwlids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisluwlids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisluwlids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisluwlids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisluwlids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisluwlids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisluwlids0_uploadrelease(context): - returned_value = postRelease("cuahsihisluwlids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisluwlids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisluwlids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisluwlids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisluwlids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisluwlids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisluwlids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisluwlids0_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisluwlids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisluwlids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisluwlids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisluwlids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisluwlids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisluwlids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisluwlids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, 
source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisluwlids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisluwlids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisluwlids0(): - containers = cuahsihisluwlids0_getImage() - harvest = cuahsihisluwlids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisluwlids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisluwlids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisluwlids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisluwlids0") - load_release = cuahsihisluwlids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisluwlids0_uploadrelease(start=load_release) - - load_prune = cuahsihisluwlids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisluwlids0_nabuprov(start=load_prune) - load_org = cuahsihisluwlids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisluwlids0_missingreport_graph(start=load_org) - report_graph=cuahsihisluwlids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismaaeriids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismaaeriids0.py deleted file mode 100644 index 72bd6b1e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismaaeriids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = 
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = 
f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = 
bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: 
DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = 
ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log 
file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", 
"S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - 
data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihismaaeriids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihismaaeriids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihismaaeriids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismaaeriids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihismaaeriids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismaaeriids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihismaaeriids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismaaeriids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihismaaeriids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismaaeriids0_naburelease(context): - 
returned_value = gleanerio(context,("release"), "cuahsihismaaeriids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismaaeriids0_uploadrelease(context): - returned_value = postRelease("cuahsihismaaeriids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihismaaeriids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismaaeriids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismaaeriids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismaaeriids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismaaeriids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismaaeriids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned 
value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismaaeriids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismaaeriids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismaaeriids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismaaeriids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismaaeriids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismaaeriids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihismaaeriids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismaaeriids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihismaaeriids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihismaaeriids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihismaaeriids0(): - containers = cuahsihismaaeriids0_getImage() - harvest = cuahsihismaaeriids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihismaaeriids0_missingreport_s3(start=harvest) - report_idstat = cuahsihismaaeriids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihismaaeriids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihismaaeriids0") - load_release = cuahsihismaaeriids0_naburelease(start=harvest) - load_uploadrelease = cuahsihismaaeriids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihismaaeriids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihismaaeriids0_nabuprov(start=load_prune) - load_org = cuahsihismaaeriids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihismaaeriids0_missingreport_graph(start=load_org) - report_graph=cuahsihismaaeriids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismazarriverprojectids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismazarriverprojectids0.py deleted file mode 100644 index 1bda0d7a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismazarriverprojectids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - 
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihismazarriverprojectids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihismazarriverprojectids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihismazarriverprojectids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismazarriverprojectids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihismazarriverprojectids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismazarriverprojectids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihismazarriverprojectids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismazarriverprojectids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihismazarriverprojectids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismazarriverprojectids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihismazarriverprojectids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
cuahsihismazarriverprojectids0_uploadrelease(context): - returned_value = postRelease("cuahsihismazarriverprojectids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihismazarriverprojectids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismazarriverprojectids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismazarriverprojectids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismazarriverprojectids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismazarriverprojectids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismazarriverprojectids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, 
"missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismazarriverprojectids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismazarriverprojectids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismazarriverprojectids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismazarriverprojectids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismazarriverprojectids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismazarriverprojectids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismazarriverprojectids0_bucket_urls(context): - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismazarriverprojectids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihismazarriverprojectids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihismazarriverprojectids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihismazarriverprojectids0(): - containers = cuahsihismazarriverprojectids0_getImage() - harvest = cuahsihismazarriverprojectids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihismazarriverprojectids0_missingreport_s3(start=harvest) - report_idstat = cuahsihismazarriverprojectids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihismazarriverprojectids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihismazarriverprojectids0") - load_release = cuahsihismazarriverprojectids0_naburelease(start=harvest) - load_uploadrelease = 
cuahsihismazarriverprojectids0_uploadrelease(start=load_release) - - load_prune = cuahsihismazarriverprojectids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihismazarriverprojectids0_nabuprov(start=load_prune) - load_org = cuahsihismazarriverprojectids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihismazarriverprojectids0_missingreport_graph(start=load_org) - report_graph=cuahsihismazarriverprojectids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismmaatacamaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismmaatacamaids0.py deleted file mode 100644 index 8f2dd38c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismmaatacamaids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, 
validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihismmaatacamaids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihismmaatacamaids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihismmaatacamaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismmaatacamaids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihismmaatacamaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismmaatacamaids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihismmaatacamaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismmaatacamaids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihismmaatacamaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismmaatacamaids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihismmaatacamaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismmaatacamaids0_uploadrelease(context): - returned_value = 
postRelease("cuahsihismmaatacamaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihismmaatacamaids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismmaatacamaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismmaatacamaids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismmaatacamaids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismmaatacamaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismmaatacamaids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": 
In(Nothing)}) -def cuahsihismmaatacamaids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismmaatacamaids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismmaatacamaids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismmaatacamaids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismmaatacamaids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismmaatacamaids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismmaatacamaids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismmaatacamaids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = 
str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihismmaatacamaids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihismmaatacamaids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihismmaatacamaids0(): - containers = cuahsihismmaatacamaids0_getImage() - harvest = cuahsihismmaatacamaids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihismmaatacamaids0_missingreport_s3(start=harvest) - report_idstat = cuahsihismmaatacamaids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihismmaatacamaids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihismmaatacamaids0") - load_release = cuahsihismmaatacamaids0_naburelease(start=harvest) - load_uploadrelease = cuahsihismmaatacamaids0_uploadrelease(start=load_release) - - load_prune = cuahsihismmaatacamaids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihismmaatacamaids0_nabuprov(start=load_prune) - load_org = cuahsihismmaatacamaids0_nabuorg(start=load_prov) - -# run after load - 
report_msgraph=cuahsihismmaatacamaids0_missingreport_graph(start=load_org) - report_graph=cuahsihismmaatacamaids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismobilecrowdhydrologyids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismobilecrowdhydrologyids0.py deleted file mode 100644 index bf374bb7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismobilecrowdhydrologyids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose 
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif 
(GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in 
r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id 
{str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == 
"prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // 
v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihismobilecrowdhydrologyids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihismobilecrowdhydrologyids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihismobilecrowdhydrologyids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismobilecrowdhydrologyids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihismobilecrowdhydrologyids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismobilecrowdhydrologyids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihismobilecrowdhydrologyids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismobilecrowdhydrologyids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihismobilecrowdhydrologyids0") - r = str('returned value:{}'.format(returned_value)) - 
get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismobilecrowdhydrologyids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihismobilecrowdhydrologyids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismobilecrowdhydrologyids0_uploadrelease(context): - returned_value = postRelease("cuahsihismobilecrowdhydrologyids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihismobilecrowdhydrologyids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismobilecrowdhydrologyids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismobilecrowdhydrologyids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismobilecrowdhydrologyids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismobilecrowdhydrologyids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismobilecrowdhydrologyids0" - - graphendpoint = _graphEndpoint()# 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismobilecrowdhydrologyids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismobilecrowdhydrologyids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismobilecrowdhydrologyids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismobilecrowdhydrologyids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismobilecrowdhydrologyids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismobilecrowdhydrologyids0" - - returned_value = generateIdentifierRepo(source_name, bucket, 
s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismobilecrowdhydrologyids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismobilecrowdhydrologyids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihismobilecrowdhydrologyids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihismobilecrowdhydrologyids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihismobilecrowdhydrologyids0(): - containers = cuahsihismobilecrowdhydrologyids0_getImage() - harvest = cuahsihismobilecrowdhydrologyids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihismobilecrowdhydrologyids0_missingreport_s3(start=harvest) - 
report_idstat = cuahsihismobilecrowdhydrologyids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihismobilecrowdhydrologyids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihismobilecrowdhydrologyids0") - load_release = cuahsihismobilecrowdhydrologyids0_naburelease(start=harvest) - load_uploadrelease = cuahsihismobilecrowdhydrologyids0_uploadrelease(start=load_release) - - load_prune = cuahsihismobilecrowdhydrologyids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihismobilecrowdhydrologyids0_nabuprov(start=load_prune) - load_org = cuahsihismobilecrowdhydrologyids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihismobilecrowdhydrologyids0_missingreport_graph(start=load_org) - report_graph=cuahsihismobilecrowdhydrologyids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismopexids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismopexids0.py deleted file mode 100644 index 960b60d8..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismopexids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - 
-import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihismopexids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihismopexids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihismopexids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismopexids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihismopexids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismopexids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihismopexids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismopexids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihismopexids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismopexids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihismopexids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismopexids0_uploadrelease(context): - returned_value = postRelease("cuahsihismopexids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihismopexids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismopexids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismopexids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismopexids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismopexids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismopexids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismopexids0_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismopexids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismopexids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismopexids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismopexids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismopexids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismopexids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismopexids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, 
source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihismopexids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihismopexids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihismopexids0(): - containers = cuahsihismopexids0_getImage() - harvest = cuahsihismopexids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihismopexids0_missingreport_s3(start=harvest) - report_idstat = cuahsihismopexids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihismopexids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihismopexids0") - load_release = cuahsihismopexids0_naburelease(start=harvest) - load_uploadrelease = cuahsihismopexids0_uploadrelease(start=load_release) - - load_prune = cuahsihismopexids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihismopexids0_nabuprov(start=load_prune) - load_org = cuahsihismopexids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihismopexids0_missingreport_graph(start=load_org) - report_graph=cuahsihismopexids0_graph_reports(start=report_msgraph) - - - - diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismuddyriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismuddyriverids0.py deleted file mode 100644 index 47e28927..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismuddyriverids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = 
os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) 
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = 
f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') 
- raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id 
{str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" 
- WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", 
"SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - 
enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihismuddyriverids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihismuddyriverids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihismuddyriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismuddyriverids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihismuddyriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismuddyriverids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihismuddyriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismuddyriverids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihismuddyriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihismuddyriverids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihismuddyriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismuddyriverids0_uploadrelease(context): - returned_value = postRelease("cuahsihismuddyriverids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihismuddyriverids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismuddyriverids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismuddyriverids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismuddyriverids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismuddyriverids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismuddyriverids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, 
graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismuddyriverids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismuddyriverids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismuddyriverids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismuddyriverids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismuddyriverids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismuddyriverids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer 
stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismuddyriverids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismuddyriverids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihismuddyriverids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihismuddyriverids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihismuddyriverids0(): - containers = cuahsihismuddyriverids0_getImage() - harvest = cuahsihismuddyriverids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihismuddyriverids0_missingreport_s3(start=harvest) - report_idstat = cuahsihismuddyriverids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihismuddyriverids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihismuddyriverids0") - load_release = cuahsihismuddyriverids0_naburelease(start=harvest) - 
load_uploadrelease = cuahsihismuddyriverids0_uploadrelease(start=load_release) - - load_prune = cuahsihismuddyriverids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihismuddyriverids0_nabuprov(start=load_prune) - load_org = cuahsihismuddyriverids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihismuddyriverids0_missingreport_graph(start=load_org) - report_graph=cuahsihismuddyriverids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismudlakeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismudlakeids0.py deleted file mode 100644 index 261e7f5d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismudlakeids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from 
docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihismudlakeids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihismudlakeids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihismudlakeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismudlakeids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihismudlakeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismudlakeids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihismudlakeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismudlakeids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihismudlakeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismudlakeids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihismudlakeids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismudlakeids0_uploadrelease(context): - returned_value = postRelease("cuahsihismudlakeids0") - r = 
str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihismudlakeids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismudlakeids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismudlakeids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismudlakeids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismudlakeids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismudlakeids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismudlakeids0_graph_reports(context) : - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismudlakeids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismudlakeids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismudlakeids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismudlakeids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismudlakeids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismudlakeids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismudlakeids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - 
s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihismudlakeids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihismudlakeids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihismudlakeids0(): - containers = cuahsihismudlakeids0_getImage() - harvest = cuahsihismudlakeids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihismudlakeids0_missingreport_s3(start=harvest) - report_idstat = cuahsihismudlakeids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihismudlakeids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihismudlakeids0") - load_release = cuahsihismudlakeids0_naburelease(start=harvest) - load_uploadrelease = cuahsihismudlakeids0_uploadrelease(start=load_release) - - load_prune = cuahsihismudlakeids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihismudlakeids0_nabuprov(start=load_prune) - load_org = cuahsihismudlakeids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihismudlakeids0_missingreport_graph(start=load_org) - report_graph=cuahsihismudlakeids0_graph_reports(start=report_msgraph) - - - - diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismwdisids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismwdisids0.py deleted file mode 100644 index ddf22ca5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismwdisids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = 
os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) 
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = 
f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') 
- raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id 
{str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" 
- WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", 
"SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - 
enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihismwdisids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihismwdisids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihismwdisids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismwdisids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihismwdisids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismwdisids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihismwdisids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismwdisids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihismwdisids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismwdisids0_naburelease(context): - 
returned_value = gleanerio(context,("release"), "cuahsihismwdisids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismwdisids0_uploadrelease(context): - returned_value = postRelease("cuahsihismwdisids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihismwdisids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwdisids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismwdisids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismwdisids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwdisids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismwdisids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned 
value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismwdisids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwdisids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismwdisids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismwdisids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwdisids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismwdisids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihismwdisids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismwdisids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihismwdisids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihismwdisids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihismwdisids0(): - containers = cuahsihismwdisids0_getImage() - harvest = cuahsihismwdisids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihismwdisids0_missingreport_s3(start=harvest) - report_idstat = cuahsihismwdisids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihismwdisids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihismwdisids0") - load_release = cuahsihismwdisids0_naburelease(start=harvest) - load_uploadrelease = cuahsihismwdisids0_uploadrelease(start=load_release) - - load_prune = cuahsihismwdisids0_nabu_prune(start=load_uploadrelease) - 
load_prov = cuahsihismwdisids0_nabuprov(start=load_prune) - load_org = cuahsihismwdisids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihismwdisids0_missingreport_graph(start=load_org) - report_graph=cuahsihismwdisids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismwraids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismwraids0.py deleted file mode 100644 index fe282d56..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihismwraids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and 
not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 
'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = 
_pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code 
== 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - 
get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used 
for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", 
"MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihismwraids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihismwraids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihismwraids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismwraids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihismwraids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismwraids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihismwraids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismwraids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihismwraids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismwraids0_naburelease(context): - returned_value = 
gleanerio(context,("release"), "cuahsihismwraids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismwraids0_uploadrelease(context): - returned_value = postRelease("cuahsihismwraids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihismwraids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwraids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismwraids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismwraids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwraids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismwraids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - 
report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihismwraids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwraids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismwraids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismwraids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihismwraids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismwraids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihismwraids0_bucket_urls(context): - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihismwraids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihismwraids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihismwraids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihismwraids0(): - containers = cuahsihismwraids0_getImage() - harvest = cuahsihismwraids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihismwraids0_missingreport_s3(start=harvest) - report_idstat = cuahsihismwraids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihismwraids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihismwraids0") - load_release = cuahsihismwraids0_naburelease(start=harvest) - load_uploadrelease = cuahsihismwraids0_uploadrelease(start=load_release) - - load_prune = cuahsihismwraids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihismwraids0_nabuprov(start=load_prune) - load_org = 
cuahsihismwraids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihismwraids0_missingreport_graph(start=load_org) - report_graph=cuahsihismwraids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnashrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnashrwaids0.py deleted file mode 100644 index f2b9f904..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnashrwaids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose 
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif 
(GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in 
r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id 
{str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == 
"prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // 
v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisnashrwaids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisnashrwaids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisnashrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnashrwaids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisnashrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnashrwaids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisnashrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnashrwaids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisnashrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisnashrwaids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisnashrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnashrwaids0_uploadrelease(context): - returned_value = postRelease("cuahsihisnashrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisnashrwaids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnashrwaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnashrwaids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnashrwaids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnashrwaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnashrwaids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, 
summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnashrwaids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnashrwaids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnashrwaids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnashrwaids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnashrwaids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnashrwaids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - 
-@op(ins={"start": In(Nothing)}) -def cuahsihisnashrwaids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnashrwaids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisnashrwaids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisnashrwaids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisnashrwaids0(): - containers = cuahsihisnashrwaids0_getImage() - harvest = cuahsihisnashrwaids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisnashrwaids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisnashrwaids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisnashrwaids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisnashrwaids0") - load_release = cuahsihisnashrwaids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisnashrwaids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihisnashrwaids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisnashrwaids0_nabuprov(start=load_prune) - load_org = cuahsihisnashrwaids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisnashrwaids0_missingreport_graph(start=load_org) - report_graph=cuahsihisnashrwaids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnceiww2ids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnceiww2ids0.py deleted file mode 100644 index b550f29e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnceiww2ids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 
'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisnceiww2ids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisnceiww2ids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisnceiww2ids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnceiww2ids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisnceiww2ids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnceiww2ids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisnceiww2ids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnceiww2ids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisnceiww2ids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnceiww2ids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisnceiww2ids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnceiww2ids0_uploadrelease(context): - returned_value = postRelease("cuahsihisnceiww2ids0") - r = 
str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisnceiww2ids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnceiww2ids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnceiww2ids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnceiww2ids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnceiww2ids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnceiww2ids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnceiww2ids0_graph_reports(context) : - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnceiww2ids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnceiww2ids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnceiww2ids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnceiww2ids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnceiww2ids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnceiww2ids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnceiww2ids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - 
s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisnceiww2ids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisnceiww2ids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisnceiww2ids0(): - containers = cuahsihisnceiww2ids0_getImage() - harvest = cuahsihisnceiww2ids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisnceiww2ids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisnceiww2ids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisnceiww2ids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisnceiww2ids0") - load_release = cuahsihisnceiww2ids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisnceiww2ids0_uploadrelease(start=load_release) - - load_prune = cuahsihisnceiww2ids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisnceiww2ids0_nabuprov(start=load_prune) - load_org = cuahsihisnceiww2ids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisnceiww2ids0_missingreport_graph(start=load_org) - report_graph=cuahsihisnceiww2ids0_graph_reports(start=report_msgraph) - - - - diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisneonids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisneonids0.py deleted file mode 100644 index bf7c8463..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisneonids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = 
os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) 
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = 
f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') 
- raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id 
{str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" 
- WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", 
"SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - 
enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisneonids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisneonids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisneonids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisneonids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisneonids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisneonids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisneonids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisneonids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisneonids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisneonids0_naburelease(context): - returned_value = 
gleanerio(context,("release"), "cuahsihisneonids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisneonids0_uploadrelease(context): - returned_value = postRelease("cuahsihisneonids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisneonids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisneonids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisneonids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisneonids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisneonids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisneonids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - 
report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisneonids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisneonids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisneonids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisneonids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisneonids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisneonids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisneonids0_bucket_urls(context): - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisneonids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisneonids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisneonids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisneonids0(): - containers = cuahsihisneonids0_getImage() - harvest = cuahsihisneonids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisneonids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisneonids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisneonids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisneonids0") - load_release = cuahsihisneonids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisneonids0_uploadrelease(start=load_release) - - load_prune = cuahsihisneonids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisneonids0_nabuprov(start=load_prune) - load_org = 
cuahsihisneonids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisneonids0_missingreport_graph(start=load_org) - report_graph=cuahsihisneonids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnevadosids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnevadosids0.py deleted file mode 100644 index 80172dc8..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnevadosids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose 
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif 
(GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in 
r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id 
{str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == 
"prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // 
v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisnevadosids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisnevadosids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisnevadosids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnevadosids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisnevadosids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnevadosids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisnevadosids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnevadosids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisnevadosids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisnevadosids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisnevadosids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnevadosids0_uploadrelease(context): - returned_value = postRelease("cuahsihisnevadosids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisnevadosids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevadosids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnevadosids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnevadosids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevadosids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnevadosids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, 
summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnevadosids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevadosids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnevadosids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnevadosids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevadosids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnevadosids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - 
-@op(ins={"start": In(Nothing)}) -def cuahsihisnevadosids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnevadosids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisnevadosids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisnevadosids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisnevadosids0(): - containers = cuahsihisnevadosids0_getImage() - harvest = cuahsihisnevadosids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisnevadosids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisnevadosids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisnevadosids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisnevadosids0") - load_release = cuahsihisnevadosids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisnevadosids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihisnevadosids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisnevadosids0_nabuprov(start=load_prune) - load_org = cuahsihisnevadosids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisnevadosids0_missingreport_graph(start=load_org) - report_graph=cuahsihisnevadosids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnevcanids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnevcanids0.py deleted file mode 100644 index d7ac8d3f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnevcanids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 
'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisnevcanids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisnevcanids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisnevcanids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnevcanids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisnevcanids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnevcanids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisnevcanids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnevcanids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisnevcanids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnevcanids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisnevcanids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnevcanids0_uploadrelease(context): - returned_value = postRelease("cuahsihisnevcanids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisnevcanids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevcanids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnevcanids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnevcanids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevcanids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnevcanids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnevcanids0_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevcanids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnevcanids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnevcanids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnevcanids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnevcanids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnevcanids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnevcanids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - 
s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisnevcanids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisnevcanids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisnevcanids0(): - containers = cuahsihisnevcanids0_getImage() - harvest = cuahsihisnevcanids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisnevcanids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisnevcanids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisnevcanids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisnevcanids0") - load_release = cuahsihisnevcanids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisnevcanids0_uploadrelease(start=load_release) - - load_prune = cuahsihisnevcanids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisnevcanids0_nabuprov(start=load_prune) - load_org = cuahsihisnevcanids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisnevcanids0_missingreport_graph(start=load_org) - report_graph=cuahsihisnevcanids0_graph_reports(start=report_msgraph) - - - - diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnewnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnewnids0.py deleted file mode 100644 index a2309479..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnewnids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = 
os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) 
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = 
f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') 
- raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id 
{str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" 
- WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", 
"SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - 
enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisnewnids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisnewnids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisnewnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnewnids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisnewnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnewnids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisnewnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnewnids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisnewnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnewnids0_naburelease(context): - returned_value = 
gleanerio(context,("release"), "cuahsihisnewnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnewnids0_uploadrelease(context): - returned_value = postRelease("cuahsihisnewnids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisnewnids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnewnids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnewnids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnewnids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnewnids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnewnids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - 
report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnewnids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnewnids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnewnids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnewnids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnewnids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnewnids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnewnids0_bucket_urls(context): - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnewnids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisnewnids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisnewnids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisnewnids0(): - containers = cuahsihisnewnids0_getImage() - harvest = cuahsihisnewnids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisnewnids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisnewnids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisnewnids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisnewnids0") - load_release = cuahsihisnewnids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisnewnids0_uploadrelease(start=load_release) - - load_prune = cuahsihisnewnids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisnewnids0_nabuprov(start=load_prune) - load_org = 
cuahsihisnewnids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisnewnids0_missingreport_graph(start=load_org) - report_graph=cuahsihisnewnids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnhgswofids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnhgswofids0.py deleted file mode 100644 index f577368a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnhgswofids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose 
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif 
(GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in 
r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id 
{str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == 
"prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // 
v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisnhgswofids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisnhgswofids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisnhgswofids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnhgswofids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisnhgswofids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnhgswofids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisnhgswofids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnhgswofids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisnhgswofids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisnhgswofids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisnhgswofids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnhgswofids0_uploadrelease(context): - returned_value = postRelease("cuahsihisnhgswofids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisnhgswofids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnhgswofids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnhgswofids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnhgswofids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnhgswofids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnhgswofids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, 
summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnhgswofids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnhgswofids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnhgswofids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnhgswofids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnhgswofids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnhgswofids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - 
-@op(ins={"start": In(Nothing)}) -def cuahsihisnhgswofids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnhgswofids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisnhgswofids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisnhgswofids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisnhgswofids0(): - containers = cuahsihisnhgswofids0_getImage() - harvest = cuahsihisnhgswofids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisnhgswofids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisnhgswofids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisnhgswofids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisnhgswofids0") - load_release = cuahsihisnhgswofids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisnhgswofids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihisnhgswofids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisnhgswofids0_nabuprov(start=load_prune) - load_org = cuahsihisnhgswofids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisnhgswofids0_missingreport_graph(start=load_org) - report_graph=cuahsihisnhgswofids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnooksackmicroclimatenetworkids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnooksackmicroclimatenetworkids0.py deleted file mode 100644 index 931b1b38..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisnooksackmicroclimatenetworkids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, 
TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisnooksackmicroclimatenetworkids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisnooksackmicroclimatenetworkids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisnooksackmicroclimatenetworkids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnooksackmicroclimatenetworkids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisnooksackmicroclimatenetworkids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnooksackmicroclimatenetworkids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisnooksackmicroclimatenetworkids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnooksackmicroclimatenetworkids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisnooksackmicroclimatenetworkids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnooksackmicroclimatenetworkids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisnooksackmicroclimatenetworkids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnooksackmicroclimatenetworkids0_uploadrelease(context): - returned_value = postRelease("cuahsihisnooksackmicroclimatenetworkids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisnooksackmicroclimatenetworkids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnooksackmicroclimatenetworkids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnooksackmicroclimatenetworkids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnooksackmicroclimatenetworkids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnooksackmicroclimatenetworkids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnooksackmicroclimatenetworkids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, 
graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisnooksackmicroclimatenetworkids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnooksackmicroclimatenetworkids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnooksackmicroclimatenetworkids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnooksackmicroclimatenetworkids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisnooksackmicroclimatenetworkids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnooksackmicroclimatenetworkids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - 
s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisnooksackmicroclimatenetworkids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisnooksackmicroclimatenetworkids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisnooksackmicroclimatenetworkids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisnooksackmicroclimatenetworkids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisnooksackmicroclimatenetworkids0(): - containers = cuahsihisnooksackmicroclimatenetworkids0_getImage() - harvest = cuahsihisnooksackmicroclimatenetworkids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisnooksackmicroclimatenetworkids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisnooksackmicroclimatenetworkids0_identifier_stats(start=report_ms3) - # for some reason, this 
causes a msg parameter missing - report_bucketurl = cuahsihisnooksackmicroclimatenetworkids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisnooksackmicroclimatenetworkids0") - load_release = cuahsihisnooksackmicroclimatenetworkids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisnooksackmicroclimatenetworkids0_uploadrelease(start=load_release) - - load_prune = cuahsihisnooksackmicroclimatenetworkids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisnooksackmicroclimatenetworkids0_nabuprov(start=load_prune) - load_org = cuahsihisnooksackmicroclimatenetworkids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisnooksackmicroclimatenetworkids0_missingreport_graph(start=load_org) - report_graph=cuahsihisnooksackmicroclimatenetworkids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisodmkentstateids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisodmkentstateids0.py deleted file mode 100644 index c8d538f2..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisodmkentstateids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster 
import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisodmkentstateids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisodmkentstateids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisodmkentstateids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisodmkentstateids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisodmkentstateids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisodmkentstateids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisodmkentstateids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisodmkentstateids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisodmkentstateids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisodmkentstateids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisodmkentstateids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisodmkentstateids0_uploadrelease(context): - 
returned_value = postRelease("cuahsihisodmkentstateids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisodmkentstateids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisodmkentstateids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisodmkentstateids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisodmkentstateids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisodmkentstateids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisodmkentstateids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - 
return -@op(ins={"start": In(Nothing)}) -def cuahsihisodmkentstateids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisodmkentstateids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisodmkentstateids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisodmkentstateids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisodmkentstateids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisodmkentstateids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisodmkentstateids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisodmkentstateids0" - - res = 
s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisodmkentstateids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisodmkentstateids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisodmkentstateids0(): - containers = cuahsihisodmkentstateids0_getImage() - harvest = cuahsihisodmkentstateids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisodmkentstateids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisodmkentstateids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisodmkentstateids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisodmkentstateids0") - load_release = cuahsihisodmkentstateids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisodmkentstateids0_uploadrelease(start=load_release) - - load_prune = cuahsihisodmkentstateids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisodmkentstateids0_nabuprov(start=load_prune) - load_org = 
cuahsihisodmkentstateids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisodmkentstateids0_missingreport_graph(start=load_org) - report_graph=cuahsihisodmkentstateids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisorsancohabids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisorsancohabids0.py deleted file mode 100644 index cdf26706..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisorsancohabids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in 
docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif 
(GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in 
r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id 
{str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == 
"prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // 
v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisorsancohabids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisorsancohabids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisorsancohabids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisorsancohabids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisorsancohabids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisorsancohabids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisorsancohabids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisorsancohabids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisorsancohabids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisorsancohabids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisorsancohabids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisorsancohabids0_uploadrelease(context): - returned_value = postRelease("cuahsihisorsancohabids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisorsancohabids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisorsancohabids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisorsancohabids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisorsancohabids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisorsancohabids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisorsancohabids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, 
graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisorsancohabids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisorsancohabids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisorsancohabids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisorsancohabids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisorsancohabids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisorsancohabids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer 
stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisorsancohabids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisorsancohabids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisorsancohabids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisorsancohabids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisorsancohabids0(): - containers = cuahsihisorsancohabids0_getImage() - harvest = cuahsihisorsancohabids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisorsancohabids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisorsancohabids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisorsancohabids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisorsancohabids0") - load_release = cuahsihisorsancohabids0_naburelease(start=harvest) - 
load_uploadrelease = cuahsihisorsancohabids0_uploadrelease(start=load_release) - - load_prune = cuahsihisorsancohabids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisorsancohabids0_nabuprov(start=load_prune) - load_org = cuahsihisorsancohabids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisorsancohabids0_missingreport_graph(start=load_org) - report_graph=cuahsihisorsancohabids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihispanolaodmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihispanolaodmids0.py deleted file mode 100644 index 84eb6651..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihispanolaodmids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image 
-from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihispanolaodmids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihispanolaodmids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihispanolaodmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihispanolaodmids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihispanolaodmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihispanolaodmids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihispanolaodmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihispanolaodmids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihispanolaodmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihispanolaodmids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihispanolaodmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihispanolaodmids0_uploadrelease(context): - returned_value = 
postRelease("cuahsihispanolaodmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihispanolaodmids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihispanolaodmids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihispanolaodmids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihispanolaodmids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihispanolaodmids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihispanolaodmids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) 
-def cuahsihispanolaodmids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihispanolaodmids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihispanolaodmids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihispanolaodmids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihispanolaodmids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihispanolaodmids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihispanolaodmids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihispanolaodmids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned 
value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihispanolaodmids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihispanolaodmids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihispanolaodmids0(): - containers = cuahsihispanolaodmids0_getImage() - harvest = cuahsihispanolaodmids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihispanolaodmids0_missingreport_s3(start=harvest) - report_idstat = cuahsihispanolaodmids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihispanolaodmids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihispanolaodmids0") - load_release = cuahsihispanolaodmids0_naburelease(start=harvest) - load_uploadrelease = cuahsihispanolaodmids0_uploadrelease(start=load_release) - - load_prune = cuahsihispanolaodmids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihispanolaodmids0_nabuprov(start=load_prune) - load_org = cuahsihispanolaodmids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihispanolaodmids0_missingreport_graph(start=load_org) - 
report_graph=cuahsihispanolaodmids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisparalanaturalezaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisparalanaturalezaids0.py deleted file mode 100644 index 1aba516e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisparalanaturalezaids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs 
-GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = 
_pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No 
Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = 
client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - 
ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # 
v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - 
enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisparalanaturalezaids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisparalanaturalezaids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisparalanaturalezaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisparalanaturalezaids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisparalanaturalezaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisparalanaturalezaids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisparalanaturalezaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisparalanaturalezaids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisparalanaturalezaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - 
return - -@op(ins={"start": In(Nothing)}) -def cuahsihisparalanaturalezaids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisparalanaturalezaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisparalanaturalezaids0_uploadrelease(context): - returned_value = postRelease("cuahsihisparalanaturalezaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisparalanaturalezaids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisparalanaturalezaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisparalanaturalezaids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisparalanaturalezaids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisparalanaturalezaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisparalanaturalezaids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - 
summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisparalanaturalezaids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisparalanaturalezaids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisparalanaturalezaids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisparalanaturalezaids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisparalanaturalezaids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisparalanaturalezaids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = 
returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisparalanaturalezaids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisparalanaturalezaids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisparalanaturalezaids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisparalanaturalezaids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisparalanaturalezaids0(): - containers = cuahsihisparalanaturalezaids0_getImage() - harvest = cuahsihisparalanaturalezaids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisparalanaturalezaids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisparalanaturalezaids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = 
cuahsihisparalanaturalezaids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisparalanaturalezaids0") - load_release = cuahsihisparalanaturalezaids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisparalanaturalezaids0_uploadrelease(start=load_release) - - load_prune = cuahsihisparalanaturalezaids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisparalanaturalezaids0_nabuprov(start=load_prune) - load_org = cuahsihisparalanaturalezaids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisparalanaturalezaids0_missingreport_graph(start=load_org) - report_graph=cuahsihisparalanaturalezaids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisprovorivergamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisprovorivergamutids0.py deleted file mode 100644 index 5862e9cb..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisprovorivergamutids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import 
parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisprovorivergamutids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisprovorivergamutids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisprovorivergamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisprovorivergamutids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisprovorivergamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisprovorivergamutids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisprovorivergamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisprovorivergamutids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisprovorivergamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisprovorivergamutids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisprovorivergamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
cuahsihisprovorivergamutids0_uploadrelease(context): - returned_value = postRelease("cuahsihisprovorivergamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisprovorivergamutids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisprovorivergamutids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisprovorivergamutids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisprovorivergamutids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisprovorivergamutids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisprovorivergamutids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", 
report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisprovorivergamutids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisprovorivergamutids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisprovorivergamutids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisprovorivergamutids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisprovorivergamutids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisprovorivergamutids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisprovorivergamutids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "cuahsihisprovorivergamutids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisprovorivergamutids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisprovorivergamutids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisprovorivergamutids0(): - containers = cuahsihisprovorivergamutids0_getImage() - harvest = cuahsihisprovorivergamutids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisprovorivergamutids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisprovorivergamutids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisprovorivergamutids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisprovorivergamutids0") - load_release = cuahsihisprovorivergamutids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisprovorivergamutids0_uploadrelease(start=load_release) - - load_prune = cuahsihisprovorivergamutids0_nabu_prune(start=load_uploadrelease) - 
load_prov = cuahsihisprovorivergamutids0_nabuprov(start=load_prune) - load_org = cuahsihisprovorivergamutids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisprovorivergamutids0_missingreport_graph(start=load_org) - report_graph=cuahsihisprovorivergamutids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisredbuttecreekgamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisredbuttecreekgamutids0.py deleted file mode 100644 index 4d4ffa61..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisredbuttecreekgamutids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 
'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisredbuttecreekgamutids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisredbuttecreekgamutids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisredbuttecreekgamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisredbuttecreekgamutids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisredbuttecreekgamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisredbuttecreekgamutids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisredbuttecreekgamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisredbuttecreekgamutids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisredbuttecreekgamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisredbuttecreekgamutids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisredbuttecreekgamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": 
In(Nothing)}) -def cuahsihisredbuttecreekgamutids0_uploadrelease(context): - returned_value = postRelease("cuahsihisredbuttecreekgamutids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisredbuttecreekgamutids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisredbuttecreekgamutids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisredbuttecreekgamutids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisredbuttecreekgamutids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisredbuttecreekgamutids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisredbuttecreekgamutids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, 
source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisredbuttecreekgamutids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisredbuttecreekgamutids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisredbuttecreekgamutids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisredbuttecreekgamutids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisredbuttecreekgamutids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisredbuttecreekgamutids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisredbuttecreekgamutids0_bucket_urls(context): - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisredbuttecreekgamutids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisredbuttecreekgamutids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisredbuttecreekgamutids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisredbuttecreekgamutids0(): - containers = cuahsihisredbuttecreekgamutids0_getImage() - harvest = cuahsihisredbuttecreekgamutids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisredbuttecreekgamutids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisredbuttecreekgamutids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisredbuttecreekgamutids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisredbuttecreekgamutids0") - load_release = cuahsihisredbuttecreekgamutids0_naburelease(start=harvest) - load_uploadrelease = 
cuahsihisredbuttecreekgamutids0_uploadrelease(start=load_release) - - load_prune = cuahsihisredbuttecreekgamutids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisredbuttecreekgamutids0_nabuprov(start=load_prune) - load_org = cuahsihisredbuttecreekgamutids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisredbuttecreekgamutids0_missingreport_graph(start=load_org) - report_graph=cuahsihisredbuttecreekgamutids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisrmblids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisrmblids0.py deleted file mode 100644 index 4567b8cc..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisrmblids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, 
validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisrmblids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisrmblids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisrmblids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisrmblids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisrmblids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisrmblids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisrmblids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisrmblids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisrmblids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisrmblids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisrmblids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisrmblids0_uploadrelease(context): - returned_value = postRelease("cuahsihisrmblids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisrmblids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisrmblids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisrmblids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisrmblids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisrmblids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisrmblids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisrmblids0_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisrmblids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisrmblids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisrmblids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisrmblids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisrmblids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisrmblids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisrmblids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, 
source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisrmblids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisrmblids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisrmblids0(): - containers = cuahsihisrmblids0_getImage() - harvest = cuahsihisrmblids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisrmblids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisrmblids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisrmblids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisrmblids0") - load_release = cuahsihisrmblids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisrmblids0_uploadrelease(start=load_release) - - load_prune = cuahsihisrmblids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisrmblids0_nabuprov(start=load_prune) - load_org = cuahsihisrmblids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisrmblids0_missingreport_graph(start=load_org) - report_graph=cuahsihisrmblids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihissagehencreekids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihissagehencreekids0.py deleted file mode 100644 index 36095daf..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihissagehencreekids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = 
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = 
f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = 
bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: 
DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = 
ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log 
file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", 
"S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - 
data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihissagehencreekids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihissagehencreekids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihissagehencreekids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihissagehencreekids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihissagehencreekids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihissagehencreekids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihissagehencreekids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihissagehencreekids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihissagehencreekids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": 
In(Nothing)}) -def cuahsihissagehencreekids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihissagehencreekids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihissagehencreekids0_uploadrelease(context): - returned_value = postRelease("cuahsihissagehencreekids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihissagehencreekids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissagehencreekids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihissagehencreekids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihissagehencreekids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissagehencreekids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihissagehencreekids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = 
missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihissagehencreekids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissagehencreekids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihissagehencreekids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihissagehencreekids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissagehencreekids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihissagehencreekids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, 
"identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihissagehencreekids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihissagehencreekids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihissagehencreekids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihissagehencreekids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihissagehencreekids0(): - containers = cuahsihissagehencreekids0_getImage() - harvest = cuahsihissagehencreekids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihissagehencreekids0_missingreport_s3(start=harvest) - report_idstat = cuahsihissagehencreekids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihissagehencreekids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, 
source="cuahsihissagehencreekids0") - load_release = cuahsihissagehencreekids0_naburelease(start=harvest) - load_uploadrelease = cuahsihissagehencreekids0_uploadrelease(start=load_release) - - load_prune = cuahsihissagehencreekids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihissagehencreekids0_nabuprov(start=load_prune) - load_org = cuahsihissagehencreekids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihissagehencreekids0_missingreport_graph(start=load_org) - report_graph=cuahsihissagehencreekids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisscanids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisscanids0.py deleted file mode 100644 index 96432ecd..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisscanids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher 
import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisscanids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisscanids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisscanids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisscanids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisscanids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisscanids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisscanids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisscanids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisscanids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisscanids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisscanids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisscanids0_uploadrelease(context): - returned_value = postRelease("cuahsihisscanids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisscanids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscanids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisscanids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisscanids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscanids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisscanids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisscanids0_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscanids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisscanids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisscanids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscanids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisscanids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisscanids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisscanids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, 
source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisscanids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisscanids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisscanids0(): - containers = cuahsihisscanids0_getImage() - harvest = cuahsihisscanids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisscanids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisscanids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisscanids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisscanids0") - load_release = cuahsihisscanids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisscanids0_uploadrelease(start=load_release) - - load_prune = cuahsihisscanids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisscanids0_nabuprov(start=load_prune) - load_org = cuahsihisscanids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisscanids0_missingreport_graph(start=load_org) - report_graph=cuahsihisscanids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisscotlandnwisids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisscotlandnwisids0.py deleted file mode 100644 index 17111bde..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisscotlandnwisids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = 
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = 
f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = 
bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: 
DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = 
ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log 
file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", 
"S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - 
data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisscotlandnwisids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisscotlandnwisids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisscotlandnwisids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisscotlandnwisids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisscotlandnwisids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisscotlandnwisids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisscotlandnwisids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisscotlandnwisids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisscotlandnwisids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": 
In(Nothing)}) -def cuahsihisscotlandnwisids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisscotlandnwisids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisscotlandnwisids0_uploadrelease(context): - returned_value = postRelease("cuahsihisscotlandnwisids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisscotlandnwisids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscotlandnwisids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisscotlandnwisids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisscotlandnwisids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscotlandnwisids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisscotlandnwisids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = 
missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisscotlandnwisids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscotlandnwisids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisscotlandnwisids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisscotlandnwisids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisscotlandnwisids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisscotlandnwisids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, 
"identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisscotlandnwisids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisscotlandnwisids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisscotlandnwisids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisscotlandnwisids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisscotlandnwisids0(): - containers = cuahsihisscotlandnwisids0_getImage() - harvest = cuahsihisscotlandnwisids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisscotlandnwisids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisscotlandnwisids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisscotlandnwisids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, 
source="cuahsihisscotlandnwisids0") - load_release = cuahsihisscotlandnwisids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisscotlandnwisids0_uploadrelease(start=load_release) - - load_prune = cuahsihisscotlandnwisids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisscotlandnwisids0_nabuprov(start=load_prune) - load_org = cuahsihisscotlandnwisids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisscotlandnwisids0_missingreport_graph(start=load_org) - report_graph=cuahsihisscotlandnwisids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisshalenetworkodmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisshalenetworkodmids0.py deleted file mode 100644 index e2b56ed5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisshalenetworkodmids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from 
dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisshalenetworkodmids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisshalenetworkodmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisshalenetworkodmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisshalenetworkodmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisshalenetworkodmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisshalenetworkodmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
cuahsihisshalenetworkodmids0_uploadrelease(context): - returned_value = postRelease("cuahsihisshalenetworkodmids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisshalenetworkodmids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisshalenetworkodmids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", 
report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisshalenetworkodmids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisshalenetworkodmids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "cuahsihisshalenetworkodmids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisshalenetworkodmids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisshalenetworkodmids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisshalenetworkodmids0(): - containers = cuahsihisshalenetworkodmids0_getImage() - harvest = cuahsihisshalenetworkodmids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisshalenetworkodmids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisshalenetworkodmids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisshalenetworkodmids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisshalenetworkodmids0") - load_release = cuahsihisshalenetworkodmids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisshalenetworkodmids0_uploadrelease(start=load_release) - - load_prune = cuahsihisshalenetworkodmids0_nabu_prune(start=load_uploadrelease) - 
load_prov = cuahsihisshalenetworkodmids0_nabuprov(start=load_prune) - load_org = cuahsihisshalenetworkodmids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisshalenetworkodmids0_missingreport_graph(start=load_org) - report_graph=cuahsihisshalenetworkodmids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisshalenetworkodmids1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisshalenetworkodmids1.py deleted file mode 100644 index 53689929..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisshalenetworkodmids1.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 
'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisshalenetworkodmids1_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids1_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisshalenetworkodmids1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids1_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisshalenetworkodmids1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids1_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisshalenetworkodmids1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids1_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisshalenetworkodmids1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids1_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisshalenetworkodmids1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def 
cuahsihisshalenetworkodmids1_uploadrelease(context): - returned_value = postRelease("cuahsihisshalenetworkodmids1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids1_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisshalenetworkodmids1" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids1_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisshalenetworkodmids1" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", 
report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids1_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids1") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisshalenetworkodmids1" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids1_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisshalenetworkodmids1") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisshalenetworkodmids1" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisshalenetworkodmids1_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "cuahsihisshalenetworkodmids1" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisshalenetworkodmids1"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisshalenetworkodmids1" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisshalenetworkodmids1(): - containers = cuahsihisshalenetworkodmids1_getImage() - harvest = cuahsihisshalenetworkodmids1_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisshalenetworkodmids1_missingreport_s3(start=harvest) - report_idstat = cuahsihisshalenetworkodmids1_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisshalenetworkodmids1_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisshalenetworkodmids1") - load_release = cuahsihisshalenetworkodmids1_naburelease(start=harvest) - load_uploadrelease = cuahsihisshalenetworkodmids1_uploadrelease(start=load_release) - - load_prune = cuahsihisshalenetworkodmids1_nabu_prune(start=load_uploadrelease) - 
load_prov = cuahsihisshalenetworkodmids1_nabuprov(start=load_prune) - load_org = cuahsihisshalenetworkodmids1_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisshalenetworkodmids1_missingreport_graph(start=load_org) - report_graph=cuahsihisshalenetworkodmids1_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisskcmilltownids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisskcmilltownids0.py deleted file mode 100644 index 3244cded..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisskcmilltownids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') 
-# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisskcmilltownids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisskcmilltownids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisskcmilltownids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisskcmilltownids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisskcmilltownids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisskcmilltownids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisskcmilltownids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisskcmilltownids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisskcmilltownids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisskcmilltownids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisskcmilltownids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisskcmilltownids0_uploadrelease(context): - returned_value = 
postRelease("cuahsihisskcmilltownids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisskcmilltownids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisskcmilltownids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisskcmilltownids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisskcmilltownids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisskcmilltownids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisskcmilltownids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": 
In(Nothing)}) -def cuahsihisskcmilltownids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisskcmilltownids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisskcmilltownids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisskcmilltownids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisskcmilltownids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisskcmilltownids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisskcmilltownids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisskcmilltownids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = 
str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisskcmilltownids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisskcmilltownids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisskcmilltownids0(): - containers = cuahsihisskcmilltownids0_getImage() - harvest = cuahsihisskcmilltownids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisskcmilltownids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisskcmilltownids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisskcmilltownids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisskcmilltownids0") - load_release = cuahsihisskcmilltownids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisskcmilltownids0_uploadrelease(start=load_release) - - load_prune = cuahsihisskcmilltownids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisskcmilltownids0_nabuprov(start=load_prune) - load_org = cuahsihisskcmilltownids0_nabuorg(start=load_prov) - -# run after load - 
report_msgraph=cuahsihisskcmilltownids0_missingreport_graph(start=load_org) - report_graph=cuahsihisskcmilltownids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihissnotelids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihissnotelids0.py deleted file mode 100644 index 94717ef7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihissnotelids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', 
"dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) 
-GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = 
_pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No 
Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = 
client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - 
ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # 
v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - 
enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihissnotelids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihissnotelids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihissnotelids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihissnotelids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihissnotelids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihissnotelids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihissnotelids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihissnotelids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihissnotelids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihissnotelids0_naburelease(context): - 
returned_value = gleanerio(context,("release"), "cuahsihissnotelids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihissnotelids0_uploadrelease(context): - returned_value = postRelease("cuahsihissnotelids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihissnotelids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissnotelids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihissnotelids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihissnotelids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissnotelids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihissnotelids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned 
value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihissnotelids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissnotelids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihissnotelids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihissnotelids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihissnotelids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihissnotelids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihissnotelids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihissnotelids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihissnotelids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihissnotelids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihissnotelids0(): - containers = cuahsihissnotelids0_getImage() - harvest = cuahsihissnotelids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihissnotelids0_missingreport_s3(start=harvest) - report_idstat = cuahsihissnotelids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihissnotelids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihissnotelids0") - load_release = cuahsihissnotelids0_naburelease(start=harvest) - load_uploadrelease = cuahsihissnotelids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihissnotelids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihissnotelids0_nabuprov(start=load_prune) - load_org = cuahsihissnotelids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihissnotelids0_missingreport_graph(start=load_org) - report_graph=cuahsihissnotelids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisswedishmonitoringdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisswedishmonitoringdataids0.py deleted file mode 100644 index 01549a00..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisswedishmonitoringdataids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - 
-DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisswedishmonitoringdataids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisswedishmonitoringdataids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisswedishmonitoringdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisswedishmonitoringdataids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisswedishmonitoringdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisswedishmonitoringdataids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisswedishmonitoringdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisswedishmonitoringdataids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisswedishmonitoringdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisswedishmonitoringdataids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisswedishmonitoringdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - 
return -@op(ins={"start": In(Nothing)}) -def cuahsihisswedishmonitoringdataids0_uploadrelease(context): - returned_value = postRelease("cuahsihisswedishmonitoringdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisswedishmonitoringdataids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisswedishmonitoringdataids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisswedishmonitoringdataids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisswedishmonitoringdataids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisswedishmonitoringdataids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisswedishmonitoringdataids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = 
json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisswedishmonitoringdataids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisswedishmonitoringdataids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisswedishmonitoringdataids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisswedishmonitoringdataids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisswedishmonitoringdataids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisswedishmonitoringdataids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": 
In(Nothing)}) -def cuahsihisswedishmonitoringdataids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisswedishmonitoringdataids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisswedishmonitoringdataids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisswedishmonitoringdataids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisswedishmonitoringdataids0(): - containers = cuahsihisswedishmonitoringdataids0_getImage() - harvest = cuahsihisswedishmonitoringdataids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisswedishmonitoringdataids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisswedishmonitoringdataids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisswedishmonitoringdataids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisswedishmonitoringdataids0") - load_release = 
cuahsihisswedishmonitoringdataids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisswedishmonitoringdataids0_uploadrelease(start=load_release) - - load_prune = cuahsihisswedishmonitoringdataids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisswedishmonitoringdataids0_nabuprov(start=load_prune) - load_org = cuahsihisswedishmonitoringdataids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisswedishmonitoringdataids0_missingreport_graph(start=load_org) - report_graph=cuahsihisswedishmonitoringdataids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistarlandwaterqualityids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistarlandwaterqualityids0.py deleted file mode 100644 index 00a25bea..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistarlandwaterqualityids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import 
DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihistarlandwaterqualityids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihistarlandwaterqualityids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihistarlandwaterqualityids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistarlandwaterqualityids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihistarlandwaterqualityids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistarlandwaterqualityids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihistarlandwaterqualityids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistarlandwaterqualityids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihistarlandwaterqualityids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistarlandwaterqualityids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihistarlandwaterqualityids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return 
-@op(ins={"start": In(Nothing)}) -def cuahsihistarlandwaterqualityids0_uploadrelease(context): - returned_value = postRelease("cuahsihistarlandwaterqualityids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihistarlandwaterqualityids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistarlandwaterqualityids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistarlandwaterqualityids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihistarlandwaterqualityids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistarlandwaterqualityids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistarlandwaterqualityids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - 
s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihistarlandwaterqualityids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistarlandwaterqualityids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistarlandwaterqualityids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistarlandwaterqualityids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistarlandwaterqualityids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistarlandwaterqualityids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihistarlandwaterqualityids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistarlandwaterqualityids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihistarlandwaterqualityids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihistarlandwaterqualityids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihistarlandwaterqualityids0(): - containers = cuahsihistarlandwaterqualityids0_getImage() - harvest = cuahsihistarlandwaterqualityids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihistarlandwaterqualityids0_missingreport_s3(start=harvest) - report_idstat = cuahsihistarlandwaterqualityids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihistarlandwaterqualityids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihistarlandwaterqualityids0") - load_release = 
cuahsihistarlandwaterqualityids0_naburelease(start=harvest) - load_uploadrelease = cuahsihistarlandwaterqualityids0_uploadrelease(start=load_release) - - load_prune = cuahsihistarlandwaterqualityids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihistarlandwaterqualityids0_nabuprov(start=load_prune) - load_org = cuahsihistarlandwaterqualityids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihistarlandwaterqualityids0_missingreport_graph(start=load_org) - report_graph=cuahsihistarlandwaterqualityids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistncwaterdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistncwaterdataids0.py deleted file mode 100644 index 540be88d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistncwaterdataids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from 
dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihistncwaterdataids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihistncwaterdataids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihistncwaterdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistncwaterdataids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihistncwaterdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistncwaterdataids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihistncwaterdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistncwaterdataids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihistncwaterdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistncwaterdataids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihistncwaterdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihistncwaterdataids0_uploadrelease(context): - 
returned_value = postRelease("cuahsihistncwaterdataids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihistncwaterdataids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistncwaterdataids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistncwaterdataids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihistncwaterdataids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistncwaterdataids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistncwaterdataids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - 
return -@op(ins={"start": In(Nothing)}) -def cuahsihistncwaterdataids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistncwaterdataids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistncwaterdataids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistncwaterdataids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistncwaterdataids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistncwaterdataids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistncwaterdataids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistncwaterdataids0" - - res = 
s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihistncwaterdataids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihistncwaterdataids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihistncwaterdataids0(): - containers = cuahsihistncwaterdataids0_getImage() - harvest = cuahsihistncwaterdataids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihistncwaterdataids0_missingreport_s3(start=harvest) - report_idstat = cuahsihistncwaterdataids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihistncwaterdataids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihistncwaterdataids0") - load_release = cuahsihistncwaterdataids0_naburelease(start=harvest) - load_uploadrelease = cuahsihistncwaterdataids0_uploadrelease(start=load_release) - - load_prune = cuahsihistncwaterdataids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihistncwaterdataids0_nabuprov(start=load_prune) - load_org = 
cuahsihistncwaterdataids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihistncwaterdataids0_missingreport_graph(start=load_org) - report_graph=cuahsihistncwaterdataids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistrwaids0.py deleted file mode 100644 index 50476d8f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistrwaids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose 
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif 
(GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in 
r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id 
{str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == 
"prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // 
v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihistrwaids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihistrwaids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihistrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistrwaids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihistrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistrwaids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihistrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistrwaids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihistrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistrwaids0_naburelease(context): - returned_value = 
gleanerio(context,("release"), "cuahsihistrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihistrwaids0_uploadrelease(context): - returned_value = postRelease("cuahsihistrwaids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihistrwaids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistrwaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistrwaids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihistrwaids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistrwaids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistrwaids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - 
report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihistrwaids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistrwaids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistrwaids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistrwaids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistrwaids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistrwaids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistrwaids0_bucket_urls(context): - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistrwaids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihistrwaids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihistrwaids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihistrwaids0(): - containers = cuahsihistrwaids0_getImage() - harvest = cuahsihistrwaids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihistrwaids0_missingreport_s3(start=harvest) - report_idstat = cuahsihistrwaids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihistrwaids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihistrwaids0") - load_release = cuahsihistrwaids0_naburelease(start=harvest) - load_uploadrelease = cuahsihistrwaids0_uploadrelease(start=load_release) - - load_prune = cuahsihistrwaids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihistrwaids0_nabuprov(start=load_prune) - load_org = 
cuahsihistrwaids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihistrwaids0_missingreport_graph(start=load_org) - report_graph=cuahsihistrwaids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistuolumnemdwids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistuolumnemdwids0.py deleted file mode 100644 index 32535a1f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihistuolumnemdwids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose 
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif 
(GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in 
r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id 
{str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == 
"prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // 
v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihistuolumnemdwids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihistuolumnemdwids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihistuolumnemdwids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistuolumnemdwids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihistuolumnemdwids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistuolumnemdwids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihistuolumnemdwids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistuolumnemdwids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihistuolumnemdwids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihistuolumnemdwids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihistuolumnemdwids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihistuolumnemdwids0_uploadrelease(context): - returned_value = postRelease("cuahsihistuolumnemdwids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihistuolumnemdwids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistuolumnemdwids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistuolumnemdwids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihistuolumnemdwids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistuolumnemdwids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistuolumnemdwids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, 
s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihistuolumnemdwids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistuolumnemdwids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistuolumnemdwids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistuolumnemdwids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihistuolumnemdwids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistuolumnemdwids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - 
get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihistuolumnemdwids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihistuolumnemdwids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihistuolumnemdwids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihistuolumnemdwids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihistuolumnemdwids0(): - containers = cuahsihistuolumnemdwids0_getImage() - harvest = cuahsihistuolumnemdwids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihistuolumnemdwids0_missingreport_s3(start=harvest) - report_idstat = cuahsihistuolumnemdwids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihistuolumnemdwids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihistuolumnemdwids0") - load_release = 
cuahsihistuolumnemdwids0_naburelease(start=harvest) - load_uploadrelease = cuahsihistuolumnemdwids0_uploadrelease(start=load_release) - - load_prune = cuahsihistuolumnemdwids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihistuolumnemdwids0_nabuprov(start=load_prune) - load_org = cuahsihistuolumnemdwids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihistuolumnemdwids0_missingreport_graph(start=load_org) - report_graph=cuahsihistuolumnemdwids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisubwpadids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisubwpadids0.py deleted file mode 100644 index b1a7ebb3..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisubwpadids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils 
import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisubwpadids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisubwpadids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisubwpadids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisubwpadids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisubwpadids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisubwpadids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisubwpadids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisubwpadids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisubwpadids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisubwpadids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisubwpadids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisubwpadids0_uploadrelease(context): - returned_value = postRelease("cuahsihisubwpadids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisubwpadids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisubwpadids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisubwpadids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisubwpadids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisubwpadids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisubwpadids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisubwpadids0_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisubwpadids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisubwpadids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisubwpadids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisubwpadids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisubwpadids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisubwpadids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisubwpadids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - 
s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisubwpadids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisubwpadids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisubwpadids0(): - containers = cuahsihisubwpadids0_getImage() - harvest = cuahsihisubwpadids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisubwpadids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisubwpadids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisubwpadids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisubwpadids0") - load_release = cuahsihisubwpadids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisubwpadids0_uploadrelease(start=load_release) - - load_prune = cuahsihisubwpadids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisubwpadids0_nabuprov(start=load_prune) - load_org = cuahsihisubwpadids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisubwpadids0_missingreport_graph(start=load_org) - report_graph=cuahsihisubwpadids0_graph_reports(start=report_msgraph) - - - - diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisumbcgwids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisumbcgwids0.py deleted file mode 100644 index 79ad1de7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisumbcgwids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = 
os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) 
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = 
f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') 
- raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id 
{str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" 
- WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", 
"SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - 
enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisumbcgwids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcgwids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisumbcgwids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcgwids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisumbcgwids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcgwids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisumbcgwids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcgwids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisumbcgwids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcgwids0_naburelease(context): - 
returned_value = gleanerio(context,("release"), "cuahsihisumbcgwids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcgwids0_uploadrelease(context): - returned_value = postRelease("cuahsihisumbcgwids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcgwids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcgwids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisumbcgwids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcgwids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcgwids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisumbcgwids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned 
value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcgwids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcgwids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisumbcgwids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcgwids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcgwids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisumbcgwids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisumbcgwids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisumbcgwids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisumbcgwids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisumbcgwids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisumbcgwids0(): - containers = cuahsihisumbcgwids0_getImage() - harvest = cuahsihisumbcgwids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisumbcgwids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisumbcgwids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisumbcgwids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisumbcgwids0") - load_release = cuahsihisumbcgwids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisumbcgwids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihisumbcgwids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisumbcgwids0_nabuprov(start=load_prune) - load_org = cuahsihisumbcgwids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisumbcgwids0_missingreport_graph(start=load_org) - report_graph=cuahsihisumbcgwids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisumbcwqids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisumbcwqids0.py deleted file mode 100644 index 7f338f30..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisumbcwqids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 
'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisumbcwqids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcwqids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisumbcwqids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcwqids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisumbcwqids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcwqids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisumbcwqids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcwqids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisumbcwqids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcwqids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisumbcwqids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcwqids0_uploadrelease(context): - returned_value = postRelease("cuahsihisumbcwqids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcwqids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcwqids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisumbcwqids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcwqids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcwqids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisumbcwqids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcwqids0_graph_reports(context) : - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcwqids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisumbcwqids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcwqids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisumbcwqids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisumbcwqids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisumbcwqids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisumbcwqids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - 
s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisumbcwqids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisumbcwqids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisumbcwqids0(): - containers = cuahsihisumbcwqids0_getImage() - harvest = cuahsihisumbcwqids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisumbcwqids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisumbcwqids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisumbcwqids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisumbcwqids0") - load_release = cuahsihisumbcwqids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisumbcwqids0_uploadrelease(start=load_release) - - load_prune = cuahsihisumbcwqids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisumbcwqids0_nabuprov(start=load_prune) - load_org = cuahsihisumbcwqids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisumbcwqids0_missingreport_graph(start=load_org) - report_graph=cuahsihisumbcwqids0_graph_reports(start=report_msgraph) - - - - diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisunhsnowids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisunhsnowids0.py deleted file mode 100644 index d5885d6b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisunhsnowids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = 
os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) 
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = 
f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') 
- raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id 
{str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" 
- WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", 
"SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - 
enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisunhsnowids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisunhsnowids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisunhsnowids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisunhsnowids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisunhsnowids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisunhsnowids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisunhsnowids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisunhsnowids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisunhsnowids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def 
cuahsihisunhsnowids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisunhsnowids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisunhsnowids0_uploadrelease(context): - returned_value = postRelease("cuahsihisunhsnowids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisunhsnowids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisunhsnowids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisunhsnowids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisunhsnowids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisunhsnowids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisunhsnowids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, 
summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisunhsnowids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisunhsnowids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisunhsnowids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisunhsnowids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisunhsnowids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisunhsnowids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - 
-@op(ins={"start": In(Nothing)}) -def cuahsihisunhsnowids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisunhsnowids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisunhsnowids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisunhsnowids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisunhsnowids0(): - containers = cuahsihisunhsnowids0_getImage() - harvest = cuahsihisunhsnowids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisunhsnowids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisunhsnowids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisunhsnowids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisunhsnowids0") - load_release = cuahsihisunhsnowids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisunhsnowids0_uploadrelease(start=load_release) - - load_prune = 
cuahsihisunhsnowids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisunhsnowids0_nabuprov(start=load_prune) - load_org = cuahsihisunhsnowids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisunhsnowids0_missingreport_graph(start=load_org) - report_graph=cuahsihisunhsnowids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisweiherbachids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisweiherbachids0.py deleted file mode 100644 index e9033092..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisweiherbachids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 
'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + 
GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def cuahsihisweiherbachids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisweiherbachids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisweiherbachids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisweiherbachids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisweiherbachids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisweiherbachids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisweiherbachids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisweiherbachids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisweiherbachids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisweiherbachids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisweiherbachids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisweiherbachids0_uploadrelease(context): - returned_value = 
postRelease("cuahsihisweiherbachids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisweiherbachids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisweiherbachids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisweiherbachids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisweiherbachids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisweiherbachids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisweiherbachids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": 
In(Nothing)}) -def cuahsihisweiherbachids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisweiherbachids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisweiherbachids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisweiherbachids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisweiherbachids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisweiherbachids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisweiherbachids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisweiherbachids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = 
str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="cuahsihisweiherbachids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisweiherbachids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisweiherbachids0(): - containers = cuahsihisweiherbachids0_getImage() - harvest = cuahsihisweiherbachids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisweiherbachids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisweiherbachids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisweiherbachids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisweiherbachids0") - load_release = cuahsihisweiherbachids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisweiherbachids0_uploadrelease(start=load_release) - - load_prune = cuahsihisweiherbachids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisweiherbachids0_nabuprov(start=load_prune) - load_org = cuahsihisweiherbachids0_nabuorg(start=load_prov) - -# run after load - 
report_msgraph=cuahsihisweiherbachids0_missingreport_graph(start=load_org) - report_graph=cuahsihisweiherbachids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisyosemitehydroclimatenetworkids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisyosemitehydroclimatenetworkids0.py deleted file mode 100644 index b675d549..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_cuahsihisyosemitehydroclimatenetworkids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose 
-GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) 
-GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif 
(GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in 
r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id 
{str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == 
"prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // 
v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def cuahsihisyosemitehydroclimatenetworkids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def cuahsihisyosemitehydroclimatenetworkids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "cuahsihisyosemitehydroclimatenetworkids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisyosemitehydroclimatenetworkids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "cuahsihisyosemitehydroclimatenetworkids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisyosemitehydroclimatenetworkids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "cuahsihisyosemitehydroclimatenetworkids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisyosemitehydroclimatenetworkids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "cuahsihisyosemitehydroclimatenetworkids0") - r = str('returned 
value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisyosemitehydroclimatenetworkids0_naburelease(context): - returned_value = gleanerio(context,("release"), "cuahsihisyosemitehydroclimatenetworkids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisyosemitehydroclimatenetworkids0_uploadrelease(context): - returned_value = postRelease("cuahsihisyosemitehydroclimatenetworkids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def cuahsihisyosemitehydroclimatenetworkids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisyosemitehydroclimatenetworkids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisyosemitehydroclimatenetworkids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisyosemitehydroclimatenetworkids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisyosemitehydroclimatenetworkids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"cuahsihisyosemitehydroclimatenetworkids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def cuahsihisyosemitehydroclimatenetworkids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisyosemitehydroclimatenetworkids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisyosemitehydroclimatenetworkids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisyosemitehydroclimatenetworkids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="cuahsihisyosemitehydroclimatenetworkids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "cuahsihisyosemitehydroclimatenetworkids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def cuahsihisyosemitehydroclimatenetworkids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "cuahsihisyosemitehydroclimatenetworkids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="cuahsihisyosemitehydroclimatenetworkids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="cuahsihisyosemitehydroclimatenetworkids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_cuahsihisyosemitehydroclimatenetworkids0(): - containers = cuahsihisyosemitehydroclimatenetworkids0_getImage() - harvest = cuahsihisyosemitehydroclimatenetworkids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = cuahsihisyosemitehydroclimatenetworkids0_missingreport_s3(start=harvest) - report_idstat = cuahsihisyosemitehydroclimatenetworkids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = cuahsihisyosemitehydroclimatenetworkids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="cuahsihisyosemitehydroclimatenetworkids0") - load_release = cuahsihisyosemitehydroclimatenetworkids0_naburelease(start=harvest) - load_uploadrelease = cuahsihisyosemitehydroclimatenetworkids0_uploadrelease(start=load_release) - - load_prune = cuahsihisyosemitehydroclimatenetworkids0_nabu_prune(start=load_uploadrelease) - load_prov = cuahsihisyosemitehydroclimatenetworkids0_nabuprov(start=load_prune) - load_org = cuahsihisyosemitehydroclimatenetworkids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=cuahsihisyosemitehydroclimatenetworkids0_missingreport_graph(start=load_org) - 
report_graph=cuahsihisyosemitehydroclimatenetworkids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_dams0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_dams0.py deleted file mode 100644 index 8ce6f1c8..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_dams0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', 
"headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) 
-GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = 
f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') 
- raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id 
{str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs = [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" 
- WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", 
"SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - 
enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. 
- try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # 
get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3") - return returnCode - -@op -def dams0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def dams0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "dams0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def dams0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "dams0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def dams0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "dams0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def dams0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "dams0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def dams0_naburelease(context): - returned_value = gleanerio(context,("release"), "dams0") - r = str('returned value:{}'.format(returned_value)) - 
get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def dams0_uploadrelease(context): - returned_value = postRelease("dams0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def dams0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "dams0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def dams0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "dams0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - 
return -@op(ins={"start": In(Nothing)}) -def dams0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "dams0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def dams0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "dams0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def dams0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "dams0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, 
source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="dams0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="dams0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_dams0(): - containers = dams0_getImage() - harvest = dams0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = dams0_missingreport_s3(start=harvest) - report_idstat = dams0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = dams0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="dams0") - load_release = dams0_naburelease(start=harvest) - load_uploadrelease = dams0_uploadrelease(start=load_release) - - load_prune = dams0_nabu_prune(start=load_uploadrelease) - load_prov = dams0_nabuprov(start=load_prune) - load_org = dams0_nabuorg(start=load_prov) - -# run after load - report_msgraph=dams0_missingreport_graph(start=load_org) - report_graph=dams0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_dams1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_dams1.py deleted file mode 100644 index be7d46d1..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_dams1.py +++ 
/dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = 
str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def 
read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def dams1_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def dams1_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "dams1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def dams1_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "dams1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def dams1_nabuprov(context): - returned_value = gleanerio(context,("prov"), "dams1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def dams1_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "dams1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def dams1_naburelease(context): - returned_value = gleanerio(context,("release"), "dams1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def dams1_uploadrelease(context): - returned_value = postRelease("dams1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def dams1_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "dams1" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def dams1_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "dams1" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def dams1_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams1") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "dams1" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def dams1_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="dams1") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "dams1" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def dams1_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "dams1" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="dams1"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="dams1" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_dams1(): - containers = dams1_getImage() - harvest = dams1_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = dams1_missingreport_s3(start=harvest) - report_idstat = dams1_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = dams1_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="dams1") - load_release = dams1_naburelease(start=harvest) - load_uploadrelease = dams1_uploadrelease(start=load_release) - - load_prune = dams1_nabu_prune(start=load_uploadrelease) - load_prov = dams1_nabuprov(start=load_prune) - load_org = dams1_nabuorg(start=load_prov) - -# run after load - report_msgraph=dams1_missingreport_graph(start=load_org) - report_graph=dams1_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_damspids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_damspids0.py deleted file mode 100644 index cab9e83c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_damspids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import 
request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def damspids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def damspids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "damspids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def damspids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "damspids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def damspids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "damspids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def damspids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "damspids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def damspids0_naburelease(context): - returned_value = gleanerio(context,("release"), "damspids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def damspids0_uploadrelease(context): - returned_value = postRelease("damspids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def damspids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="damspids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "damspids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def damspids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="damspids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "damspids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def damspids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="damspids0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "damspids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def damspids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="damspids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "damspids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def damspids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "damspids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="damspids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="damspids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_damspids0(): - containers = damspids0_getImage() - harvest = damspids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = damspids0_missingreport_s3(start=harvest) - report_idstat = damspids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = damspids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="damspids0") - load_release = damspids0_naburelease(start=harvest) - load_uploadrelease = damspids0_uploadrelease(start=load_release) - - load_prune = damspids0_nabu_prune(start=load_uploadrelease) - load_prov = damspids0_nabuprov(start=load_prune) - load_org = damspids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=damspids0_missingreport_graph(start=load_org) - report_graph=damspids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_demo0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_demo0.py deleted file mode 100644 index f2e1c016..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_demo0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger 
-import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) 
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with 
urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def demo0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def demo0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "demo0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def demo0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "demo0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def demo0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "demo0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def demo0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "demo0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def demo0_naburelease(context): - returned_value = gleanerio(context,("release"), "demo0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def demo0_uploadrelease(context): - returned_value = postRelease("demo0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def demo0_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="demo0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "demo0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def demo0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="demo0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "demo0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def demo0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="demo0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "demo0" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def demo0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="demo0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "demo0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def demo0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "demo0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="demo0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="demo0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_demo0(): - containers = demo0_getImage() - harvest = demo0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = demo0_missingreport_s3(start=harvest) - report_idstat = demo0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = demo0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="demo0") - load_release = demo0_naburelease(start=harvest) - load_uploadrelease = demo0_uploadrelease(start=load_release) - - load_prune = demo0_nabu_prune(start=load_uploadrelease) - load_prov = demo0_nabuprov(start=load_prune) - load_org = demo0_nabuorg(start=load_prov) - -# run after load - report_msgraph=demo0_missingreport_graph(start=load_org) - report_graph=demo0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_gfv11pois0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_gfv11pois0.py deleted file mode 100644 index 8e61d057..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_gfv11pois0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib 
import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def gfv11pois0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def gfv11pois0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "gfv11pois0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def gfv11pois0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "gfv11pois0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def gfv11pois0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "gfv11pois0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def gfv11pois0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "gfv11pois0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def gfv11pois0_naburelease(context): - returned_value = gleanerio(context,("release"), "gfv11pois0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def gfv11pois0_uploadrelease(context): - returned_value = postRelease("gfv11pois0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - 
-@op(ins={"start": In(Nothing)}) -def gfv11pois0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "gfv11pois0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def gfv11pois0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "gfv11pois0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def gfv11pois0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois0") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "gfv11pois0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def gfv11pois0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "gfv11pois0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def gfv11pois0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "gfv11pois0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. 
Then import these methods? -# def missingreport_s3(context, msg: str, source="gfv11pois0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="gfv11pois0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_gfv11pois0(): - containers = gfv11pois0_getImage() - harvest = gfv11pois0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = gfv11pois0_missingreport_s3(start=harvest) - report_idstat = gfv11pois0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = gfv11pois0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="gfv11pois0") - load_release = gfv11pois0_naburelease(start=harvest) - load_uploadrelease = gfv11pois0_uploadrelease(start=load_release) - - load_prune = gfv11pois0_nabu_prune(start=load_uploadrelease) - load_prov = gfv11pois0_nabuprov(start=load_prune) - load_org = gfv11pois0_nabuorg(start=load_prov) - -# run after load - report_msgraph=gfv11pois0_missingreport_graph(start=load_org) - report_graph=gfv11pois0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_gfv11pois1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_gfv11pois1.py deleted file mode 100644 index 74d86fb5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_gfv11pois1.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster 
import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def gfv11pois1_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def gfv11pois1_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "gfv11pois1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def gfv11pois1_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "gfv11pois1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def gfv11pois1_nabuprov(context): - returned_value = gleanerio(context,("prov"), "gfv11pois1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def gfv11pois1_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "gfv11pois1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def gfv11pois1_naburelease(context): - returned_value = gleanerio(context,("release"), "gfv11pois1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def gfv11pois1_uploadrelease(context): - returned_value = postRelease("gfv11pois1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - 
-@op(ins={"start": In(Nothing)}) -def gfv11pois1_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "gfv11pois1" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def gfv11pois1_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "gfv11pois1" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def gfv11pois1_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois1") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "gfv11pois1" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def gfv11pois1_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="gfv11pois1") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "gfv11pois1" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def gfv11pois1_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "gfv11pois1" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. 
Then import these methods? -# def missingreport_s3(context, msg: str, source="gfv11pois1"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="gfv11pois1" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_gfv11pois1(): - containers = gfv11pois1_getImage() - harvest = gfv11pois1_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = gfv11pois1_missingreport_s3(start=harvest) - report_idstat = gfv11pois1_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = gfv11pois1_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="gfv11pois1") - load_release = gfv11pois1_naburelease(start=harvest) - load_uploadrelease = gfv11pois1_uploadrelease(start=load_release) - - load_prune = gfv11pois1_nabu_prune(start=load_uploadrelease) - load_prov = gfv11pois1_nabuprov(start=load_prune) - load_org = gfv11pois1_nabuorg(start=load_prov) - -# run after load - report_msgraph=gfv11pois1_missingreport_graph(start=load_org) - report_graph=gfv11pois1_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hmw0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hmw0.py deleted file mode 100644 index b32e4fa7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hmw0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, 
graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def hmw0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def hmw0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "hmw0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hmw0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "hmw0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hmw0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "hmw0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hmw0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "hmw0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hmw0_naburelease(context): - returned_value = gleanerio(context,("release"), "hmw0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hmw0_uploadrelease(context): - returned_value = postRelease("hmw0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def hmw0_missingreport_s3(context): - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hmw0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hmw0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hmw0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hmw0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hmw0" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hmw0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hmw0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hmw0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hmw0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="hmw0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="hmw0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_hmw0(): - containers = hmw0_getImage() - harvest = hmw0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = hmw0_missingreport_s3(start=harvest) - report_idstat = hmw0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = hmw0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="hmw0") - load_release = hmw0_naburelease(start=harvest) - load_uploadrelease = hmw0_uploadrelease(start=load_release) - - load_prune = hmw0_nabu_prune(start=load_uploadrelease) - load_prov = hmw0_nabuprov(start=load_prune) - load_org = hmw0_nabuorg(start=load_prov) - -# run after load - report_msgraph=hmw0_missingreport_graph(start=load_org) - report_graph=hmw0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hmw1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hmw1.py deleted file mode 100644 index 1511308d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hmw1.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error 
import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT 
= str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: 
- data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def hmw1_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def hmw1_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "hmw1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hmw1_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "hmw1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hmw1_nabuprov(context): - returned_value = gleanerio(context,("prov"), "hmw1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hmw1_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "hmw1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hmw1_naburelease(context): - returned_value = gleanerio(context,("release"), "hmw1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hmw1_uploadrelease(context): - returned_value = postRelease("hmw1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def hmw1_missingreport_s3(context): - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hmw1" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hmw1_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hmw1" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hmw1_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw1") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hmw1" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hmw1_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hmw1") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hmw1" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hmw1_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hmw1" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="hmw1"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="hmw1" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_hmw1(): - containers = hmw1_getImage() - harvest = hmw1_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = hmw1_missingreport_s3(start=harvest) - report_idstat = hmw1_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = hmw1_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="hmw1") - load_release = hmw1_naburelease(start=harvest) - load_uploadrelease = hmw1_uploadrelease(start=load_release) - - load_prune = hmw1_nabu_prune(start=load_uploadrelease) - load_prov = hmw1_nabuprov(start=load_prune) - load_org = hmw1_nabuorg(start=load_prov) - -# run after load - report_msgraph=hmw1_missingreport_graph(start=load_org) - report_graph=hmw1_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu020.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu020.py deleted file mode 100644 index eccccfb8..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu020.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error 
import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT 
= str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: 
- data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def hu020_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def hu020_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "hu020") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu020_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "hu020") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu020_nabuprov(context): - returned_value = gleanerio(context,("prov"), "hu020") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu020_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "hu020") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu020_naburelease(context): - returned_value = gleanerio(context,("release"), "hu020") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu020_uploadrelease(context): - returned_value = postRelease("hu020") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def hu020_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu020") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu020" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu020_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu020") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu020" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu020_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu020") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu020" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu020_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu020") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu020" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu020_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu020" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="hu020"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="hu020" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_hu020(): - containers = hu020_getImage() - harvest = hu020_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = hu020_missingreport_s3(start=harvest) - report_idstat = hu020_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = hu020_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="hu020") - load_release = hu020_naburelease(start=harvest) - load_uploadrelease = hu020_uploadrelease(start=load_release) - - load_prune = hu020_nabu_prune(start=load_uploadrelease) - load_prov = hu020_nabuprov(start=load_prune) - load_org = hu020_nabuorg(start=load_prov) - -# run after load - report_msgraph=hu020_missingreport_graph(start=load_org) - report_graph=hu020_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu040.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu040.py deleted file mode 100644 index 6173a826..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu040.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def hu040_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def hu040_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "hu040") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu040_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "hu040") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu040_nabuprov(context): - returned_value = gleanerio(context,("prov"), "hu040") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu040_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "hu040") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu040_naburelease(context): - returned_value = gleanerio(context,("release"), "hu040") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu040_uploadrelease(context): - returned_value = postRelease("hu040") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def hu040_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu040") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu040" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu040_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu040") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu040" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu040_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu040") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu040" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu040_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu040") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu040" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu040_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu040" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="hu040"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="hu040" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_hu040(): - containers = hu040_getImage() - harvest = hu040_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = hu040_missingreport_s3(start=harvest) - report_idstat = hu040_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = hu040_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="hu040") - load_release = hu040_naburelease(start=harvest) - load_uploadrelease = hu040_uploadrelease(start=load_release) - - load_prune = hu040_nabu_prune(start=load_uploadrelease) - load_prov = hu040_nabuprov(start=load_prune) - load_org = hu040_nabuorg(start=load_prov) - -# run after load - report_msgraph=hu040_missingreport_graph(start=load_org) - report_graph=hu040_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu060.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu060.py deleted file mode 100644 index 97e5300c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu060.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def hu060_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def hu060_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "hu060") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu060_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "hu060") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu060_nabuprov(context): - returned_value = gleanerio(context,("prov"), "hu060") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu060_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "hu060") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu060_naburelease(context): - returned_value = gleanerio(context,("release"), "hu060") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu060_uploadrelease(context): - returned_value = postRelease("hu060") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def hu060_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu060") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu060" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu060_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu060") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu060" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu060_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu060") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu060" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu060_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu060") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu060" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu060_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu060" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="hu060"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="hu060" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_hu060(): - containers = hu060_getImage() - harvest = hu060_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = hu060_missingreport_s3(start=harvest) - report_idstat = hu060_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = hu060_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="hu060") - load_release = hu060_naburelease(start=harvest) - load_uploadrelease = hu060_uploadrelease(start=load_release) - - load_prune = hu060_nabu_prune(start=load_uploadrelease) - load_prov = hu060_nabuprov(start=load_prune) - load_org = hu060_nabuorg(start=load_prov) - -# run after load - report_msgraph=hu060_missingreport_graph(start=load_org) - report_graph=hu060_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu080.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu080.py deleted file mode 100644 index 6b84e353..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu080.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def hu080_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def hu080_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "hu080") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu080_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "hu080") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu080_nabuprov(context): - returned_value = gleanerio(context,("prov"), "hu080") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu080_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "hu080") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu080_naburelease(context): - returned_value = gleanerio(context,("release"), "hu080") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu080_uploadrelease(context): - returned_value = postRelease("hu080") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def hu080_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu080") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu080" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu080_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu080") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu080" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu080_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu080") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu080" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu080_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu080") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu080" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu080_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu080" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="hu080"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="hu080" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_hu080(): - containers = hu080_getImage() - harvest = hu080_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = hu080_missingreport_s3(start=harvest) - report_idstat = hu080_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = hu080_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="hu080") - load_release = hu080_naburelease(start=harvest) - load_uploadrelease = hu080_uploadrelease(start=load_release) - - load_prune = hu080_nabu_prune(start=load_uploadrelease) - load_prov = hu080_nabuprov(start=load_prune) - load_org = hu080_nabuorg(start=load_prov) - -# run after load - report_msgraph=hu080_missingreport_graph(start=load_org) - report_graph=hu080_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu100.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu100.py deleted file mode 100644 index 376573aa..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hu100.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def hu100_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def hu100_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "hu100") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu100_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "hu100") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu100_nabuprov(context): - returned_value = gleanerio(context,("prov"), "hu100") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu100_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "hu100") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu100_naburelease(context): - returned_value = gleanerio(context,("release"), "hu100") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu100_uploadrelease(context): - returned_value = postRelease("hu100") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def hu100_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu100") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu100" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu100_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu100") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu100" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hu100_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu100") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu100" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu100_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hu100") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu100" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hu100_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hu100" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="hu100"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="hu100" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_hu100(): - containers = hu100_getImage() - harvest = hu100_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = hu100_missingreport_s3(start=harvest) - report_idstat = hu100_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = hu100_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="hu100") - load_release = hu100_naburelease(start=harvest) - load_uploadrelease = hu100_uploadrelease(start=load_release) - - load_prune = hu100_nabu_prune(start=load_uploadrelease) - load_prov = hu100_nabuprov(start=load_prune) - load_org = hu100_nabuorg(start=load_prov) - -# run after load - report_msgraph=hu100_missingreport_graph(start=load_org) - report_graph=hu100_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_huc12pp0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_huc12pp0.py deleted file mode 100644 index 17204a23..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_huc12pp0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import 
request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def huc12pp0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def huc12pp0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "huc12pp0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def huc12pp0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "huc12pp0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def huc12pp0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "huc12pp0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def huc12pp0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "huc12pp0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def huc12pp0_naburelease(context): - returned_value = gleanerio(context,("release"), "huc12pp0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def huc12pp0_uploadrelease(context): - returned_value = postRelease("huc12pp0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
huc12pp0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "huc12pp0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def huc12pp0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "huc12pp0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def huc12pp0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "huc12pp0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def huc12pp0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "huc12pp0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def huc12pp0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "huc12pp0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="huc12pp0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="huc12pp0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_huc12pp0(): - containers = huc12pp0_getImage() - harvest = huc12pp0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = huc12pp0_missingreport_s3(start=harvest) - report_idstat = huc12pp0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = huc12pp0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="huc12pp0") - load_release = huc12pp0_naburelease(start=harvest) - load_uploadrelease = huc12pp0_uploadrelease(start=load_release) - - load_prune = huc12pp0_nabu_prune(start=load_uploadrelease) - load_prov = huc12pp0_nabuprov(start=load_prune) - load_org = huc12pp0_nabuorg(start=load_prov) - -# run after load - report_msgraph=huc12pp0_missingreport_graph(start=load_org) - report_graph=huc12pp0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_huc12pp1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_huc12pp1.py deleted file mode 100644 index c2b1b731..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_huc12pp1.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def huc12pp1_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def huc12pp1_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "huc12pp1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def huc12pp1_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "huc12pp1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def huc12pp1_nabuprov(context): - returned_value = gleanerio(context,("prov"), "huc12pp1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def huc12pp1_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "huc12pp1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def huc12pp1_naburelease(context): - returned_value = gleanerio(context,("release"), "huc12pp1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def huc12pp1_uploadrelease(context): - returned_value = postRelease("huc12pp1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
huc12pp1_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "huc12pp1" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def huc12pp1_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "huc12pp1" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def huc12pp1_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp1") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "huc12pp1" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def huc12pp1_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="huc12pp1") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "huc12pp1" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def huc12pp1_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "huc12pp1" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="huc12pp1"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="huc12pp1" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_huc12pp1(): - containers = huc12pp1_getImage() - harvest = huc12pp1_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = huc12pp1_missingreport_s3(start=harvest) - report_idstat = huc12pp1_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = huc12pp1_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="huc12pp1") - load_release = huc12pp1_naburelease(start=harvest) - load_uploadrelease = huc12pp1_uploadrelease(start=load_release) - - load_prune = huc12pp1_nabu_prune(start=load_uploadrelease) - load_prov = huc12pp1_nabuprov(start=load_prune) - load_org = huc12pp1_nabuorg(start=load_prov) - -# run after load - report_msgraph=huc12pp1_missingreport_graph(start=load_org) - report_graph=huc12pp1_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hydrologicunit0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hydrologicunit0.py deleted file mode 100644 index 7117cbe5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_hydrologicunit0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, 
get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def hydrologicunit0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def hydrologicunit0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "hydrologicunit0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hydrologicunit0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "hydrologicunit0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hydrologicunit0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "hydrologicunit0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hydrologicunit0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "hydrologicunit0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hydrologicunit0_naburelease(context): - returned_value = gleanerio(context,("release"), "hydrologicunit0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hydrologicunit0_uploadrelease(context): - returned_value = postRelease("hydrologicunit0") - r = str('returned value:{}'.format(returned_value)) - 
get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def hydrologicunit0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hydrologicunit0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hydrologicunit0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hydrologicunit0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hydrologicunit0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hydrologicunit0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def hydrologicunit0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, 
sourcename="hydrologicunit0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hydrologicunit0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hydrologicunit0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="hydrologicunit0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hydrologicunit0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def hydrologicunit0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "hydrologicunit0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - 
get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="hydrologicunit0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="hydrologicunit0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_hydrologicunit0(): - containers = hydrologicunit0_getImage() - harvest = hydrologicunit0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = hydrologicunit0_missingreport_s3(start=harvest) - report_idstat = hydrologicunit0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = hydrologicunit0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="hydrologicunit0") - load_release = hydrologicunit0_naburelease(start=harvest) - load_uploadrelease = hydrologicunit0_uploadrelease(start=load_release) - - load_prune = hydrologicunit0_nabu_prune(start=load_uploadrelease) - load_prov = hydrologicunit0_nabuprov(start=load_prune) - load_org = hydrologicunit0_nabuorg(start=load_prov) - -# run after load - report_msgraph=hydrologicunit0_missingreport_graph(start=load_org) - report_graph=hydrologicunit0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_links0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_links0.py deleted file mode 100644 
index 3faabe65..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_links0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = 
bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = 
"s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def links0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def links0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "links0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def links0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "links0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def links0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "links0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def links0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "links0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def links0_naburelease(context): - returned_value = gleanerio(context,("release"), "links0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def links0_uploadrelease(context): - returned_value = postRelease("links0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
links0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="links0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "links0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def links0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="links0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "links0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def links0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="links0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "links0" - - 
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def links0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="links0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "links0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def links0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "links0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="links0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="links0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_links0(): - containers = links0_getImage() - harvest = links0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = links0_missingreport_s3(start=harvest) - report_idstat = links0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = links0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="links0") - load_release = links0_naburelease(start=harvest) - load_uploadrelease = links0_uploadrelease(start=load_release) - - load_prune = links0_nabu_prune(start=load_uploadrelease) - load_prov = links0_nabuprov(start=load_prune) - load_org = links0_nabuorg(start=load_prov) - -# run after load - report_msgraph=links0_missingreport_graph(start=load_org) - report_graph=links0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_mainstems0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_mainstems0.py deleted file mode 100644 index cca2df36..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_mainstems0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib 
-from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def mainstems0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def mainstems0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "mainstems0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def mainstems0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "mainstems0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def mainstems0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "mainstems0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def mainstems0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "mainstems0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def mainstems0_naburelease(context): - returned_value = gleanerio(context,("release"), "mainstems0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def mainstems0_uploadrelease(context): - returned_value = postRelease("mainstems0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - 
-@op(ins={"start": In(Nothing)}) -def mainstems0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="mainstems0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "mainstems0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def mainstems0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="mainstems0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "mainstems0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def mainstems0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="mainstems0") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "mainstems0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def mainstems0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="mainstems0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "mainstems0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def mainstems0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "mainstems0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. 
Then import these methods? -# def missingreport_s3(context, msg: str, source="mainstems0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="mainstems0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_mainstems0(): - containers = mainstems0_getImage() - harvest = mainstems0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = mainstems0_missingreport_s3(start=harvest) - report_idstat = mainstems0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = mainstems0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="mainstems0") - load_release = mainstems0_naburelease(start=harvest) - load_uploadrelease = mainstems0_uploadrelease(start=load_release) - - load_prune = mainstems0_nabu_prune(start=load_uploadrelease) - load_prov = mainstems0_nabuprov(start=load_prune) - load_org = mainstems0_nabuorg(start=load_prov) - -# run after load - report_msgraph=mainstems0_missingreport_graph(start=load_org) - report_graph=mainstems0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nataq0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nataq0.py deleted file mode 100644 index 4eaac896..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nataq0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, 
op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nataq0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nataq0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nataq0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nataq0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nataq0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nataq0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nataq0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nataq0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nataq0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nataq0_naburelease(context): - returned_value = gleanerio(context,("release"), "nataq0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nataq0_uploadrelease(context): - returned_value = postRelease("nataq0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nataq0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nataq0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nataq0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nataq0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nataq0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nataq0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nataq0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nataq0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nataq0" - - 
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nataq0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nataq0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nataq0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nataq0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nataq0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nataq0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nataq0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nataq0(): - containers = nataq0_getImage() - harvest = nataq0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nataq0_missingreport_s3(start=harvest) - report_idstat = nataq0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nataq0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nataq0") - load_release = nataq0_naburelease(start=harvest) - load_uploadrelease = nataq0_uploadrelease(start=load_release) - - load_prune = nataq0_nabu_prune(start=load_uploadrelease) - load_prov = nataq0_nabuprov(start=load_prune) - load_org = nataq0_nabuorg(start=load_prov) - -# run after load - report_msgraph=nataq0_missingreport_graph(start=load_org) - report_graph=nataq0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose0.py deleted file mode 100644 index 7d309ca8..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib 
-from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nmwdiose0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nmwdiose0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nmwdiose0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nmwdiose0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nmwdiose0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nmwdiose0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose0_naburelease(context): - returned_value = gleanerio(context,("release"), "nmwdiose0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose0_uploadrelease(context): - returned_value = postRelease("nmwdiose0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def nmwdiose0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "nmwdiose0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nmwdiose0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nmwdiose0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nmwdiose0(): - containers = nmwdiose0_getImage() - harvest = nmwdiose0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nmwdiose0_missingreport_s3(start=harvest) - report_idstat = nmwdiose0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nmwdiose0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nmwdiose0") - load_release = nmwdiose0_naburelease(start=harvest) - load_uploadrelease = nmwdiose0_uploadrelease(start=load_release) - - load_prune = nmwdiose0_nabu_prune(start=load_uploadrelease) - load_prov = nmwdiose0_nabuprov(start=load_prune) - load_org = nmwdiose0_nabuorg(start=load_prov) - -# run after load - report_msgraph=nmwdiose0_missingreport_graph(start=load_org) - report_graph=nmwdiose0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose1.py deleted file mode 100644 index 8c16bf16..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose1.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, 
get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nmwdiose1_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nmwdiose1_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nmwdiose1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose1_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nmwdiose1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose1_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nmwdiose1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose1_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nmwdiose1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose1_naburelease(context): - returned_value = gleanerio(context,("release"), "nmwdiose1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose1_uploadrelease(context): - returned_value = postRelease("nmwdiose1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def nmwdiose1_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose1" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose1_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose1" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose1_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose1") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "nmwdiose1" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose1_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose1") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose1" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose1_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose1" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nmwdiose1"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nmwdiose1" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nmwdiose1(): - containers = nmwdiose1_getImage() - harvest = nmwdiose1_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nmwdiose1_missingreport_s3(start=harvest) - report_idstat = nmwdiose1_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nmwdiose1_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nmwdiose1") - load_release = nmwdiose1_naburelease(start=harvest) - load_uploadrelease = nmwdiose1_uploadrelease(start=load_release) - - load_prune = nmwdiose1_nabu_prune(start=load_uploadrelease) - load_prov = nmwdiose1_nabuprov(start=load_prune) - load_org = nmwdiose1_nabuorg(start=load_prov) - -# run after load - report_msgraph=nmwdiose1_missingreport_graph(start=load_org) - report_graph=nmwdiose1_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose2.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose2.py deleted file mode 100644 index 04b63cab..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose2.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, 
get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nmwdiose2_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nmwdiose2_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nmwdiose2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose2_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nmwdiose2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose2_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nmwdiose2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose2_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nmwdiose2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose2_naburelease(context): - returned_value = gleanerio(context,("release"), "nmwdiose2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose2_uploadrelease(context): - returned_value = postRelease("nmwdiose2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def nmwdiose2_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose2") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose2" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose2_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose2") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose2" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose2_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose2") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "nmwdiose2" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose2_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose2") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose2" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose2_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose2" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nmwdiose2"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nmwdiose2" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nmwdiose2(): - containers = nmwdiose2_getImage() - harvest = nmwdiose2_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nmwdiose2_missingreport_s3(start=harvest) - report_idstat = nmwdiose2_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nmwdiose2_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nmwdiose2") - load_release = nmwdiose2_naburelease(start=harvest) - load_uploadrelease = nmwdiose2_uploadrelease(start=load_release) - - load_prune = nmwdiose2_nabu_prune(start=load_uploadrelease) - load_prov = nmwdiose2_nabuprov(start=load_prune) - load_org = nmwdiose2_nabuorg(start=load_prov) - -# run after load - report_msgraph=nmwdiose2_missingreport_graph(start=load_org) - report_graph=nmwdiose2_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose3.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose3.py deleted file mode 100644 index 5995e358..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose3.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, 
get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nmwdiose3_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nmwdiose3_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nmwdiose3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose3_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nmwdiose3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose3_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nmwdiose3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose3_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nmwdiose3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose3_naburelease(context): - returned_value = gleanerio(context,("release"), "nmwdiose3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose3_uploadrelease(context): - returned_value = postRelease("nmwdiose3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def nmwdiose3_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose3") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose3" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose3_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose3") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose3" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose3_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose3") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "nmwdiose3" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose3_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose3") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose3" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose3_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose3" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nmwdiose3"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nmwdiose3" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nmwdiose3(): - containers = nmwdiose3_getImage() - harvest = nmwdiose3_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nmwdiose3_missingreport_s3(start=harvest) - report_idstat = nmwdiose3_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nmwdiose3_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nmwdiose3") - load_release = nmwdiose3_naburelease(start=harvest) - load_uploadrelease = nmwdiose3_uploadrelease(start=load_release) - - load_prune = nmwdiose3_nabu_prune(start=load_uploadrelease) - load_prov = nmwdiose3_nabuprov(start=load_prune) - load_org = nmwdiose3_nabuorg(start=load_prov) - -# run after load - report_msgraph=nmwdiose3_missingreport_graph(start=load_org) - report_graph=nmwdiose3_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose4.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose4.py deleted file mode 100644 index 0e12cc23..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdiose4.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, 
get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nmwdiose4_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nmwdiose4_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nmwdiose4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose4_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nmwdiose4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose4_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nmwdiose4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose4_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nmwdiose4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose4_naburelease(context): - returned_value = gleanerio(context,("release"), "nmwdiose4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose4_uploadrelease(context): - returned_value = postRelease("nmwdiose4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def nmwdiose4_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose4") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose4" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose4_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose4") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose4" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdiose4_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose4") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "nmwdiose4" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose4_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdiose4") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose4" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdiose4_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdiose4" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nmwdiose4"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nmwdiose4" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nmwdiose4(): - containers = nmwdiose4_getImage() - harvest = nmwdiose4_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nmwdiose4_missingreport_s3(start=harvest) - report_idstat = nmwdiose4_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nmwdiose4_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nmwdiose4") - load_release = nmwdiose4_naburelease(start=harvest) - load_uploadrelease = nmwdiose4_uploadrelease(start=load_release) - - load_prune = nmwdiose4_nabu_prune(start=load_uploadrelease) - load_prov = nmwdiose4_nabuprov(start=load_prune) - load_org = nmwdiose4_nabuorg(start=load_prov) - -# run after load - report_msgraph=nmwdiose4_missingreport_graph(start=load_org) - report_graph=nmwdiose4_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdist0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdist0.py deleted file mode 100644 index 63865abd..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nmwdist0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, 
get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nmwdist0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nmwdist0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nmwdist0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdist0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nmwdist0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdist0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nmwdist0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdist0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nmwdist0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdist0_naburelease(context): - returned_value = gleanerio(context,("release"), "nmwdist0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdist0_uploadrelease(context): - returned_value = postRelease("nmwdist0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nmwdist0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdist0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdist0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdist0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdist0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdist0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nmwdist0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdist0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nmwdist0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdist0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nmwdist0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdist0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nmwdist0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nmwdist0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nmwdist0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nmwdist0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nmwdist0(): - containers = nmwdist0_getImage() - harvest = nmwdist0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nmwdist0_missingreport_s3(start=harvest) - report_idstat = nmwdist0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nmwdist0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nmwdist0") - load_release = nmwdist0_naburelease(start=harvest) - load_uploadrelease = nmwdist0_uploadrelease(start=load_release) - - load_prune = nmwdist0_nabu_prune(start=load_uploadrelease) - load_prov = nmwdist0_nabuprov(start=load_prune) - load_org = nmwdist0_nabuorg(start=load_prov) - -# run after load - report_msgraph=nmwdist0_missingreport_graph(start=load_org) - report_graph=nmwdist0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw0.py deleted file mode 100644 index 80663d14..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, 
json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw0_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw0_uploadrelease(context): - returned_value = postRelease("nwisgw0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"nwisgw0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw0(): - containers = nwisgw0_getImage() - harvest = nwisgw0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw0_missingreport_s3(start=harvest) - report_idstat = nwisgw0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw0") - load_release = nwisgw0_naburelease(start=harvest) - load_uploadrelease = nwisgw0_uploadrelease(start=load_release) - - load_prune = nwisgw0_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw0_nabuprov(start=load_prune) - load_org = nwisgw0_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw0_missingreport_graph(start=load_org) - report_graph=nwisgw0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw1.py deleted file mode 100644 index 7fd2ba8d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw1.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import 
urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw1_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw1_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw1_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw1_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw1_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw1_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw1_uploadrelease(context): - returned_value = postRelease("nwisgw1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw1_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw1" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw1_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw1" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw1_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw1") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"nwisgw1" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw1_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw1") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw1" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw1_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw1" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw1"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw1" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw1(): - containers = nwisgw1_getImage() - harvest = nwisgw1_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw1_missingreport_s3(start=harvest) - report_idstat = nwisgw1_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw1_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw1") - load_release = nwisgw1_naburelease(start=harvest) - load_uploadrelease = nwisgw1_uploadrelease(start=load_release) - - load_prune = nwisgw1_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw1_nabuprov(start=load_prune) - load_org = nwisgw1_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw1_missingreport_graph(start=load_org) - report_graph=nwisgw1_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw10.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw10.py deleted file mode 100644 index b1247e3e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw10.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io 
-import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw10_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw10_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw10_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw10_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw10_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw10_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw10_uploadrelease(context): - returned_value = postRelease("nwisgw10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw10_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw10") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw10" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw10_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw10") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw10" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw10_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw10") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw10" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw10_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw10") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw10" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw10_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw10" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw10"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw10" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw10(): - containers = nwisgw10_getImage() - harvest = nwisgw10_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw10_missingreport_s3(start=harvest) - report_idstat = nwisgw10_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw10_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw10") - load_release = nwisgw10_naburelease(start=harvest) - load_uploadrelease = nwisgw10_uploadrelease(start=load_release) - - load_prune = nwisgw10_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw10_nabuprov(start=load_prune) - load_org = nwisgw10_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw10_missingreport_graph(start=load_org) - report_graph=nwisgw10_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw11.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw11.py deleted file mode 100644 index 8cc75edb..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw11.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw11_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw11_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw11") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw11_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw11") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw11_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw11") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw11_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw11") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw11_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw11") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw11_uploadrelease(context): - returned_value = postRelease("nwisgw11") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw11_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw11") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw11" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw11_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw11") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw11" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw11_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw11") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw11" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw11_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw11") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw11" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw11_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw11" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw11"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw11" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw11(): - containers = nwisgw11_getImage() - harvest = nwisgw11_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw11_missingreport_s3(start=harvest) - report_idstat = nwisgw11_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw11_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw11") - load_release = nwisgw11_naburelease(start=harvest) - load_uploadrelease = nwisgw11_uploadrelease(start=load_release) - - load_prune = nwisgw11_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw11_nabuprov(start=load_prune) - load_org = nwisgw11_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw11_missingreport_graph(start=load_org) - report_graph=nwisgw11_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw12.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw12.py deleted file mode 100644 index c242c2f6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw12.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw12_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw12_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw12") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw12_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw12") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw12_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw12") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw12_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw12") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw12_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw12") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw12_uploadrelease(context): - returned_value = postRelease("nwisgw12") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw12_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw12") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw12" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw12_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw12") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw12" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw12_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw12") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw12" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw12_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw12") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw12" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw12_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw12" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw12"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw12" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw12(): - containers = nwisgw12_getImage() - harvest = nwisgw12_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw12_missingreport_s3(start=harvest) - report_idstat = nwisgw12_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw12_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw12") - load_release = nwisgw12_naburelease(start=harvest) - load_uploadrelease = nwisgw12_uploadrelease(start=load_release) - - load_prune = nwisgw12_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw12_nabuprov(start=load_prune) - load_org = nwisgw12_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw12_missingreport_graph(start=load_org) - report_graph=nwisgw12_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw13.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw13.py deleted file mode 100644 index 20c3418d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw13.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw13_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw13_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw13") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw13_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw13") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw13_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw13") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw13_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw13") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw13_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw13") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw13_uploadrelease(context): - returned_value = postRelease("nwisgw13") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw13_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw13") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw13" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw13_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw13") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw13" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw13_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw13") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw13" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw13_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw13") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw13" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw13_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw13" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw13"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw13" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw13(): - containers = nwisgw13_getImage() - harvest = nwisgw13_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw13_missingreport_s3(start=harvest) - report_idstat = nwisgw13_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw13_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw13") - load_release = nwisgw13_naburelease(start=harvest) - load_uploadrelease = nwisgw13_uploadrelease(start=load_release) - - load_prune = nwisgw13_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw13_nabuprov(start=load_prune) - load_org = nwisgw13_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw13_missingreport_graph(start=load_org) - report_graph=nwisgw13_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw14.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw14.py deleted file mode 100644 index ccb6e998..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw14.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw14_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw14_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw14") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw14_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw14") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw14_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw14") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw14_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw14") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw14_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw14") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw14_uploadrelease(context): - returned_value = postRelease("nwisgw14") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw14_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw14") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw14" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw14_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw14") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw14" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw14_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw14") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw14" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw14_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw14") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw14" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw14_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw14" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw14"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw14" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw14(): - containers = nwisgw14_getImage() - harvest = nwisgw14_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw14_missingreport_s3(start=harvest) - report_idstat = nwisgw14_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw14_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw14") - load_release = nwisgw14_naburelease(start=harvest) - load_uploadrelease = nwisgw14_uploadrelease(start=load_release) - - load_prune = nwisgw14_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw14_nabuprov(start=load_prune) - load_org = nwisgw14_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw14_missingreport_graph(start=load_org) - report_graph=nwisgw14_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw15.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw15.py deleted file mode 100644 index 65e7e3ca..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw15.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw15_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw15_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw15") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw15_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw15") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw15_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw15") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw15_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw15") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw15_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw15") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw15_uploadrelease(context): - returned_value = postRelease("nwisgw15") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw15_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw15") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw15" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw15_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw15") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw15" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw15_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw15") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw15" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw15_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw15") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw15" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw15_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw15" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw15"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw15" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw15(): - containers = nwisgw15_getImage() - harvest = nwisgw15_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw15_missingreport_s3(start=harvest) - report_idstat = nwisgw15_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw15_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw15") - load_release = nwisgw15_naburelease(start=harvest) - load_uploadrelease = nwisgw15_uploadrelease(start=load_release) - - load_prune = nwisgw15_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw15_nabuprov(start=load_prune) - load_org = nwisgw15_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw15_missingreport_graph(start=load_org) - report_graph=nwisgw15_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw16.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw16.py deleted file mode 100644 index 6b794bbd..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw16.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw16_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw16_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw16") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw16_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw16") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw16_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw16") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw16_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw16") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw16_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw16") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw16_uploadrelease(context): - returned_value = postRelease("nwisgw16") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw16_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw16") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw16" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw16_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw16") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw16" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw16_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw16") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw16" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw16_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw16") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw16" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw16_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw16" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw16"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw16" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw16(): - containers = nwisgw16_getImage() - harvest = nwisgw16_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw16_missingreport_s3(start=harvest) - report_idstat = nwisgw16_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw16_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw16") - load_release = nwisgw16_naburelease(start=harvest) - load_uploadrelease = nwisgw16_uploadrelease(start=load_release) - - load_prune = nwisgw16_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw16_nabuprov(start=load_prune) - load_org = nwisgw16_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw16_missingreport_graph(start=load_org) - report_graph=nwisgw16_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw17.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw17.py deleted file mode 100644 index 059c0d0b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw17.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw17_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw17_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw17") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw17_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw17") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw17_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw17") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw17_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw17") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw17_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw17") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw17_uploadrelease(context): - returned_value = postRelease("nwisgw17") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw17_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw17") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw17" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw17_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw17") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw17" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw17_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw17") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw17" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw17_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw17") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw17" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw17_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw17" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw17"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw17" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw17(): - containers = nwisgw17_getImage() - harvest = nwisgw17_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw17_missingreport_s3(start=harvest) - report_idstat = nwisgw17_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw17_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw17") - load_release = nwisgw17_naburelease(start=harvest) - load_uploadrelease = nwisgw17_uploadrelease(start=load_release) - - load_prune = nwisgw17_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw17_nabuprov(start=load_prune) - load_org = nwisgw17_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw17_missingreport_graph(start=load_org) - report_graph=nwisgw17_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw18.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw18.py deleted file mode 100644 index f45c6048..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw18.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw18_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw18_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw18") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw18_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw18") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw18_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw18") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw18_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw18") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw18_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw18") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw18_uploadrelease(context): - returned_value = postRelease("nwisgw18") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw18_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw18") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw18" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw18_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw18") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw18" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw18_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw18") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw18" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw18_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw18") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw18" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw18_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw18" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw18"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw18" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw18(): - containers = nwisgw18_getImage() - harvest = nwisgw18_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw18_missingreport_s3(start=harvest) - report_idstat = nwisgw18_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw18_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw18") - load_release = nwisgw18_naburelease(start=harvest) - load_uploadrelease = nwisgw18_uploadrelease(start=load_release) - - load_prune = nwisgw18_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw18_nabuprov(start=load_prune) - load_org = nwisgw18_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw18_missingreport_graph(start=load_org) - report_graph=nwisgw18_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw19.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw19.py deleted file mode 100644 index 5c2c5e25..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw19.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw19_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw19_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw19") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw19_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw19") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw19_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw19") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw19_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw19") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw19_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw19") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw19_uploadrelease(context): - returned_value = postRelease("nwisgw19") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw19_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw19") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw19" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw19_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw19") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw19" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw19_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw19") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw19" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw19_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw19") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw19" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw19_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw19" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw19"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw19" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw19(): - containers = nwisgw19_getImage() - harvest = nwisgw19_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw19_missingreport_s3(start=harvest) - report_idstat = nwisgw19_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw19_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw19") - load_release = nwisgw19_naburelease(start=harvest) - load_uploadrelease = nwisgw19_uploadrelease(start=load_release) - - load_prune = nwisgw19_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw19_nabuprov(start=load_prune) - load_org = nwisgw19_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw19_missingreport_graph(start=load_org) - report_graph=nwisgw19_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw2.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw2.py deleted file mode 100644 index 0bb8f1d0..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw2.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, 
json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw2_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw2_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw2_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw2_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw2_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw2_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw2_uploadrelease(context): - returned_value = postRelease("nwisgw2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw2_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw2") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw2" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw2_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw2") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw2" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw2_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw2") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"nwisgw2" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw2_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw2") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw2" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw2_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw2" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw2"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw2" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw2(): - containers = nwisgw2_getImage() - harvest = nwisgw2_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw2_missingreport_s3(start=harvest) - report_idstat = nwisgw2_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw2_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw2") - load_release = nwisgw2_naburelease(start=harvest) - load_uploadrelease = nwisgw2_uploadrelease(start=load_release) - - load_prune = nwisgw2_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw2_nabuprov(start=load_prune) - load_org = nwisgw2_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw2_missingreport_graph(start=load_org) - report_graph=nwisgw2_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw20.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw20.py deleted file mode 100644 index e63561fe..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw20.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io 
-import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw20_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw20_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw20") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw20_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw20") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw20_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw20") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw20_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw20") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw20_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw20") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw20_uploadrelease(context): - returned_value = postRelease("nwisgw20") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw20_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw20") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw20" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw20_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw20") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw20" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw20_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw20") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw20" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw20_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw20") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw20" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw20_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw20" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw20"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw20" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw20(): - containers = nwisgw20_getImage() - harvest = nwisgw20_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw20_missingreport_s3(start=harvest) - report_idstat = nwisgw20_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw20_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw20") - load_release = nwisgw20_naburelease(start=harvest) - load_uploadrelease = nwisgw20_uploadrelease(start=load_release) - - load_prune = nwisgw20_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw20_nabuprov(start=load_prune) - load_org = nwisgw20_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw20_missingreport_graph(start=load_org) - report_graph=nwisgw20_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw21.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw21.py deleted file mode 100644 index 3067b9df..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw21.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw21_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw21_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw21") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw21_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw21") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw21_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw21") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw21_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw21") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw21_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw21") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw21_uploadrelease(context): - returned_value = postRelease("nwisgw21") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw21_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw21") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw21" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw21_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw21") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw21" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw21_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw21") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw21" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw21_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw21") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw21" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw21_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw21" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw21"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw21" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw21(): - containers = nwisgw21_getImage() - harvest = nwisgw21_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw21_missingreport_s3(start=harvest) - report_idstat = nwisgw21_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw21_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw21") - load_release = nwisgw21_naburelease(start=harvest) - load_uploadrelease = nwisgw21_uploadrelease(start=load_release) - - load_prune = nwisgw21_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw21_nabuprov(start=load_prune) - load_org = nwisgw21_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw21_missingreport_graph(start=load_org) - report_graph=nwisgw21_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw22.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw22.py deleted file mode 100644 index 2a868789..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw22.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw22_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw22_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw22") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw22_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw22") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw22_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw22") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw22_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw22") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw22_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw22") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw22_uploadrelease(context): - returned_value = postRelease("nwisgw22") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw22_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw22") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw22" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw22_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw22") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw22" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw22_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw22") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw22" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw22_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw22") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw22" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw22_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw22" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw22"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw22" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw22(): - containers = nwisgw22_getImage() - harvest = nwisgw22_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw22_missingreport_s3(start=harvest) - report_idstat = nwisgw22_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw22_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw22") - load_release = nwisgw22_naburelease(start=harvest) - load_uploadrelease = nwisgw22_uploadrelease(start=load_release) - - load_prune = nwisgw22_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw22_nabuprov(start=load_prune) - load_org = nwisgw22_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw22_missingreport_graph(start=load_org) - report_graph=nwisgw22_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw23.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw23.py deleted file mode 100644 index 38b2304b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw23.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw23_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw23_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw23") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw23_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw23") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw23_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw23") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw23_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw23") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw23_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw23") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw23_uploadrelease(context): - returned_value = postRelease("nwisgw23") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw23_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw23") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw23" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw23_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw23") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw23" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw23_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw23") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw23" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw23_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw23") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw23" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw23_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw23" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw23"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw23" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw23(): - containers = nwisgw23_getImage() - harvest = nwisgw23_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw23_missingreport_s3(start=harvest) - report_idstat = nwisgw23_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw23_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw23") - load_release = nwisgw23_naburelease(start=harvest) - load_uploadrelease = nwisgw23_uploadrelease(start=load_release) - - load_prune = nwisgw23_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw23_nabuprov(start=load_prune) - load_org = nwisgw23_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw23_missingreport_graph(start=load_org) - report_graph=nwisgw23_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw24.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw24.py deleted file mode 100644 index f6728464..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw24.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw24_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw24_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw24") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw24_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw24") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw24_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw24") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw24_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw24") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw24_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw24") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw24_uploadrelease(context): - returned_value = postRelease("nwisgw24") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw24_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw24") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw24" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw24_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw24") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw24" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw24_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw24") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw24" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw24_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw24") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw24" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw24_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw24" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw24"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw24" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw24(): - containers = nwisgw24_getImage() - harvest = nwisgw24_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw24_missingreport_s3(start=harvest) - report_idstat = nwisgw24_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw24_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw24") - load_release = nwisgw24_naburelease(start=harvest) - load_uploadrelease = nwisgw24_uploadrelease(start=load_release) - - load_prune = nwisgw24_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw24_nabuprov(start=load_prune) - load_org = nwisgw24_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw24_missingreport_graph(start=load_org) - report_graph=nwisgw24_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw25.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw25.py deleted file mode 100644 index 3e7f9f02..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw25.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw25_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw25_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw25") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw25_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw25") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw25_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw25") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw25_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw25") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw25_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw25") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw25_uploadrelease(context): - returned_value = postRelease("nwisgw25") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw25_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw25") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw25" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw25_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw25") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw25" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw25_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw25") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw25" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw25_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw25") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw25" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw25_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw25" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw25"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw25" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw25(): - containers = nwisgw25_getImage() - harvest = nwisgw25_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw25_missingreport_s3(start=harvest) - report_idstat = nwisgw25_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw25_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw25") - load_release = nwisgw25_naburelease(start=harvest) - load_uploadrelease = nwisgw25_uploadrelease(start=load_release) - - load_prune = nwisgw25_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw25_nabuprov(start=load_prune) - load_org = nwisgw25_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw25_missingreport_graph(start=load_org) - report_graph=nwisgw25_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw26.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw26.py deleted file mode 100644 index 8cdab085..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw26.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw26_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw26_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw26") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw26_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw26") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw26_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw26") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw26_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw26") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw26_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw26") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw26_uploadrelease(context): - returned_value = postRelease("nwisgw26") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw26_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw26") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw26" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw26_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw26") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw26" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw26_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw26") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw26" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw26_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw26") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw26" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw26_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw26" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw26"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw26" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw26(): - containers = nwisgw26_getImage() - harvest = nwisgw26_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw26_missingreport_s3(start=harvest) - report_idstat = nwisgw26_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw26_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw26") - load_release = nwisgw26_naburelease(start=harvest) - load_uploadrelease = nwisgw26_uploadrelease(start=load_release) - - load_prune = nwisgw26_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw26_nabuprov(start=load_prune) - load_org = nwisgw26_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw26_missingreport_graph(start=load_org) - report_graph=nwisgw26_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw27.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw27.py deleted file mode 100644 index c4069a97..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw27.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw27_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw27_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw27") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw27_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw27") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw27_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw27") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw27_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw27") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw27_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw27") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw27_uploadrelease(context): - returned_value = postRelease("nwisgw27") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw27_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw27") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw27" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw27_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw27") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw27" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw27_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw27") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw27" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw27_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw27") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw27" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw27_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw27" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw27"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw27" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw27(): - containers = nwisgw27_getImage() - harvest = nwisgw27_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw27_missingreport_s3(start=harvest) - report_idstat = nwisgw27_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw27_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw27") - load_release = nwisgw27_naburelease(start=harvest) - load_uploadrelease = nwisgw27_uploadrelease(start=load_release) - - load_prune = nwisgw27_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw27_nabuprov(start=load_prune) - load_org = nwisgw27_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw27_missingreport_graph(start=load_org) - report_graph=nwisgw27_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw28.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw28.py deleted file mode 100644 index 3c5035c4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw28.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw28_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw28_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw28") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw28_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw28") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw28_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw28") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw28_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw28") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw28_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw28") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw28_uploadrelease(context): - returned_value = postRelease("nwisgw28") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw28_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw28") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw28" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw28_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw28") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw28" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw28_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw28") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "nwisgw28" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw28_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw28") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw28" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw28_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw28" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw28"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw28" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw28(): - containers = nwisgw28_getImage() - harvest = nwisgw28_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw28_missingreport_s3(start=harvest) - report_idstat = nwisgw28_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw28_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw28") - load_release = nwisgw28_naburelease(start=harvest) - load_uploadrelease = nwisgw28_uploadrelease(start=load_release) - - load_prune = nwisgw28_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw28_nabuprov(start=load_prune) - load_org = nwisgw28_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw28_missingreport_graph(start=load_org) - report_graph=nwisgw28_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw3.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw3.py deleted file mode 100644 index 2974d5e9..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw3.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, 
json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw3_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw3_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw3_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw3_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw3_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw3_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw3_uploadrelease(context): - returned_value = postRelease("nwisgw3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw3_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw3") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw3" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw3_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw3") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw3" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw3_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw3") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"nwisgw3" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw3_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw3") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw3" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw3_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw3" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw3"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw3" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw3(): - containers = nwisgw3_getImage() - harvest = nwisgw3_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw3_missingreport_s3(start=harvest) - report_idstat = nwisgw3_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw3_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw3") - load_release = nwisgw3_naburelease(start=harvest) - load_uploadrelease = nwisgw3_uploadrelease(start=load_release) - - load_prune = nwisgw3_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw3_nabuprov(start=load_prune) - load_org = nwisgw3_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw3_missingreport_graph(start=load_org) - report_graph=nwisgw3_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw4.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw4.py deleted file mode 100644 index fd1fbca9..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw4.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import 
urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw4_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw4_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw4_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw4_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw4_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw4_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw4_uploadrelease(context): - returned_value = postRelease("nwisgw4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw4_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw4") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw4" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw4_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw4") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw4" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw4_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw4") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"nwisgw4" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw4_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw4") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw4" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw4_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw4" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw4"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw4" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw4(): - containers = nwisgw4_getImage() - harvest = nwisgw4_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw4_missingreport_s3(start=harvest) - report_idstat = nwisgw4_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw4_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw4") - load_release = nwisgw4_naburelease(start=harvest) - load_uploadrelease = nwisgw4_uploadrelease(start=load_release) - - load_prune = nwisgw4_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw4_nabuprov(start=load_prune) - load_org = nwisgw4_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw4_missingreport_graph(start=load_org) - report_graph=nwisgw4_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw5.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw5.py deleted file mode 100644 index 0c59a791..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw5.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import 
urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw5_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw5_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw5") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw5_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw5") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw5_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw5") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw5_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw5") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw5_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw5") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw5_uploadrelease(context): - returned_value = postRelease("nwisgw5") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw5_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw5") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw5" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw5_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw5") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw5" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw5_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw5") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"nwisgw5" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw5_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw5") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw5" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw5_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw5" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw5"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw5" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw5(): - containers = nwisgw5_getImage() - harvest = nwisgw5_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw5_missingreport_s3(start=harvest) - report_idstat = nwisgw5_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw5_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw5") - load_release = nwisgw5_naburelease(start=harvest) - load_uploadrelease = nwisgw5_uploadrelease(start=load_release) - - load_prune = nwisgw5_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw5_nabuprov(start=load_prune) - load_org = nwisgw5_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw5_missingreport_graph(start=load_org) - report_graph=nwisgw5_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw6.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw6.py deleted file mode 100644 index 15ce0c7f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw6.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import 
urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw6_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw6_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw6") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw6_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw6") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw6_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw6") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw6_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw6") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw6_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw6") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw6_uploadrelease(context): - returned_value = postRelease("nwisgw6") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw6_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw6") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw6" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw6_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw6") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw6" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw6_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw6") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"nwisgw6" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw6_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw6") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw6" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw6_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw6" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw6"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw6" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw6(): - containers = nwisgw6_getImage() - harvest = nwisgw6_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw6_missingreport_s3(start=harvest) - report_idstat = nwisgw6_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw6_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw6") - load_release = nwisgw6_naburelease(start=harvest) - load_uploadrelease = nwisgw6_uploadrelease(start=load_release) - - load_prune = nwisgw6_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw6_nabuprov(start=load_prune) - load_org = nwisgw6_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw6_missingreport_graph(start=load_org) - report_graph=nwisgw6_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw7.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw7.py deleted file mode 100644 index 50ca251f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw7.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import 
urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw7_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw7_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw7") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw7_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw7") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw7_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw7") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw7_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw7") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw7_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw7") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw7_uploadrelease(context): - returned_value = postRelease("nwisgw7") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw7_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw7") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw7" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw7_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw7") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw7" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw7_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw7") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"nwisgw7" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw7_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw7") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw7" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw7_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw7" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw7"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw7" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw7(): - containers = nwisgw7_getImage() - harvest = nwisgw7_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw7_missingreport_s3(start=harvest) - report_idstat = nwisgw7_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw7_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw7") - load_release = nwisgw7_naburelease(start=harvest) - load_uploadrelease = nwisgw7_uploadrelease(start=load_release) - - load_prune = nwisgw7_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw7_nabuprov(start=load_prune) - load_org = nwisgw7_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw7_missingreport_graph(start=load_org) - report_graph=nwisgw7_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw8.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw8.py deleted file mode 100644 index 1bed0045..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw8.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import 
urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw8_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw8_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw8") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw8_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw8") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw8_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw8") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw8_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw8") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw8_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw8") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw8_uploadrelease(context): - returned_value = postRelease("nwisgw8") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw8_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw8") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw8" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw8_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw8") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw8" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw8_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw8") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"nwisgw8" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw8_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw8") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw8" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw8_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw8" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw8"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw8" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw8(): - containers = nwisgw8_getImage() - harvest = nwisgw8_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw8_missingreport_s3(start=harvest) - report_idstat = nwisgw8_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw8_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw8") - load_release = nwisgw8_naburelease(start=harvest) - load_uploadrelease = nwisgw8_uploadrelease(start=load_release) - - load_prune = nwisgw8_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw8_nabuprov(start=load_prune) - load_org = nwisgw8_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw8_missingreport_graph(start=load_org) - report_graph=nwisgw8_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw9.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw9.py deleted file mode 100644 index c7c07d5c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwisgw9.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import 
urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwisgw9_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwisgw9_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwisgw9") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw9_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwisgw9") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw9_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwisgw9") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw9_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwisgw9") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw9_naburelease(context): - returned_value = gleanerio(context,("release"), "nwisgw9") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw9_uploadrelease(context): - returned_value = postRelease("nwisgw9") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
nwisgw9_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw9") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw9" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw9_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw9") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw9" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwisgw9_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw9") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"nwisgw9" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw9_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwisgw9") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw9" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwisgw9_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwisgw9" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwisgw9"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwisgw9" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwisgw9(): - containers = nwisgw9_getImage() - harvest = nwisgw9_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwisgw9_missingreport_s3(start=harvest) - report_idstat = nwisgw9_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwisgw9_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwisgw9") - load_release = nwisgw9_naburelease(start=harvest) - load_uploadrelease = nwisgw9_uploadrelease(start=load_release) - - load_prune = nwisgw9_nabu_prune(start=load_uploadrelease) - load_prov = nwisgw9_nabuprov(start=load_prune) - load_org = nwisgw9_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwisgw9_missingreport_graph(start=load_org) - report_graph=nwisgw9_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite0.py deleted file mode 100644 index 84bea693..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io 
-import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
@op
def nwissite0_getImage(context):
    """Pull the gleaner and nabu images ahead of the harvest.

    A Docker container context is built for the current run; the run launcher
    is only handed over when it is a ``DockerRunLauncher``. The client returned
    by ``_get_client`` then pulls ``GLEANERIO_GLEANER_IMAGE`` followed by
    ``GLEANERIO_NABU_IMAGE``.
    """
    dagster_run = context.dagster_run
    launcher = context.instance.run_launcher
    if not isinstance(launcher, DockerRunLauncher):
        launcher = None
    run_context = DockerContainerContext.create_for_run(dagster_run, launcher)
    get_dagster_logger().info("call docker _get_client: ")
    docker_client = _get_client(run_context)
    for image_name in (GLEANERIO_GLEANER_IMAGE, GLEANERIO_NABU_IMAGE):
        docker_client.images.pull(image_name)


@op(ins={"start": In(Nothing)})
def nwissite0_gleaner(context):
    """Run ``gleanerio`` in "gleaner" mode for source ``nwissite0`` and log the outcome."""
    outcome = gleanerio(context, "gleaner", "nwissite0")
    summary = 'returned value:{}'.format(outcome)
    get_dagster_logger().info(f"Gleaner returned {summary} ")


@op(ins={"start": In(Nothing)})
def nwissite0_nabu_prune(context):
    """Run ``gleanerio`` in "prune" mode for source ``nwissite0`` and log the outcome."""
    outcome = gleanerio(context, "prune", "nwissite0")
    summary = 'returned value:{}'.format(outcome)
    get_dagster_logger().info(f"nabu prune returned {summary} ")


@op(ins={"start": In(Nothing)})
def nwissite0_nabuprov(context):
    """Run ``gleanerio`` in "prov" mode for source ``nwissite0`` and log the outcome."""
    outcome = gleanerio(context, "prov", "nwissite0")
    summary = 'returned value:{}'.format(outcome)
    get_dagster_logger().info(f"nabu prov returned {summary} ")


@op(ins={"start": In(Nothing)})
def nwissite0_nabuorg(context):
    """Run ``gleanerio`` in "orgs" mode for source ``nwissite0`` and log the outcome."""
    outcome = gleanerio(context, "orgs", "nwissite0")
    summary = 'returned value:{}'.format(outcome)
    get_dagster_logger().info(f"nabu org load returned {summary} ")


@op(ins={"start": In(Nothing)})
def nwissite0_naburelease(context):
    """Run ``gleanerio`` in "release" mode for source ``nwissite0`` and log the outcome."""
    outcome = gleanerio(context, "release", "nwissite0")
    summary = 'returned value:{}'.format(outcome)
    get_dagster_logger().info(f"nabu release returned {summary} ")


@op(ins={"start": In(Nothing)})
def nwissite0_uploadrelease(context):
    """Call ``postRelease`` for source ``nwissite0`` and log what it returned."""
    outcome = postRelease("nwissite0")
    summary = 'returned value:{}'.format(outcome)
    get_dagster_logger().info(f"upload release returned {summary} ")
@op(ins={"start": In(Nothing)})
def nwissite0_missingreport_graph(context):
    """Build the "missing" report against the graph for ``nwissite0`` and store it.

    The source's sitemap URL is looked up in the gleaner config, ``missingReport``
    is called with ``milled=True`` / ``summon=False`` and the SPARQL endpoint from
    ``_graphEndpoint()``, and the JSON-serialised result is written through
    ``putReportFile`` as ``missing_report_graph.json``.
    """
    source_name = "nwissite0"
    sitemap_source = getSitemapSourcesFromGleaner(
        GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite0"
    )
    sitemap_url = sitemap_source.get('url')
    datastore = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
    bucket_name = GLEANER_MINIO_BUCKET
    sparql_endpoint = _graphEndpoint()

    # milled on, summon off: this variant is the graph-side report.
    missing = missingReport(
        sitemap_url,
        bucket_name,
        source_name,
        datastore,
        sparql_endpoint,
        milled=True,
        summon=False,
    )
    summary = 'missing report graph returned value:{}'.format(missing)
    report_json = json.dumps(missing, indent=2)

    datastore.putReportFile(bucket_name, source_name, "missing_report_graph.json", report_json)
    get_dagster_logger().info(f"missing graph report returned {summary} ")
@op(ins={"start": In(Nothing)})
def nwissite0_identifier_stats(context):
    """Generate identifier statistics for ``nwissite0`` and store them as JSON.

    ``generateIdentifierRepo`` produces the statistics; its ``to_json()`` output
    is written through ``putReportFile`` as ``identifier_stats.json``.
    """
    # The lookup result is not used below; the call itself is kept as in the
    # generated code.
    getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite0")
    datastore = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
    bucket_name = GLEANER_MINIO_BUCKET
    source_name = "nwissite0"

    stats = generateIdentifierRepo(source_name, bucket_name, datastore)
    summary = 'returned value:{}'.format(stats)
    report_json = stats.to_json()
    datastore.putReportFile(bucket_name, source_name, "identifier_stats.json", report_json)
    get_dagster_logger().info(f"identifer stats report returned {summary} ")


@op(ins={"start": In(Nothing)})
def nwissite0_bucket_urls(context):
    """List the summoned URLs for ``nwissite0`` and store the listing as JSON.

    The result of ``listSummonedUrls`` is serialised with ``json.dumps`` and
    written through ``putReportFile`` as ``bucketutil_urls.json``.
    """
    datastore = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None)
    bucket_name = GLEANER_MINIO_BUCKET
    source_name = "nwissite0"

    summoned = datastore.listSummonedUrls(bucket_name, source_name)
    summary = 'returned value:{}'.format(summoned)
    listing_json = json.dumps(summoned, indent=2)
    datastore.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", listing_json)
    get_dagster_logger().info(f"bucker urls report returned {summary} ")
@graph
def harvest_nwissite0():
    """Wire the ``nwissite0`` ops into a harvest graph.

    Ordering is expressed purely through Nothing-dependencies (``start=``), see
    https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies

    Two chains hang off the gleaner harvest:
      * S3-side reports: missing report -> identifier stats -> bucket URLs
      * graph load: release -> upload release -> prune -> prov -> orgs,
        followed by the graph-side reports.
    """
    images_ready = nwissite0_getImage()
    harvested = nwissite0_gleaner(start=images_ready)

    # Reports that only need the harvested objects in S3.
    s3_missing_done = nwissite0_missingreport_s3(start=harvested)
    id_stats_done = nwissite0_identifier_stats(start=s3_missing_done)
    # NOTE(review): the generated code carried the remark "for some reason,
    # this causes a msg parameter missing" next to this step; left unresolved.
    nwissite0_bucket_urls(start=id_stats_done)

    # Load chain, also rooted at the harvest.
    released = nwissite0_naburelease(start=harvested)
    uploaded = nwissite0_uploadrelease(start=released)
    pruned = nwissite0_nabu_prune(start=uploaded)
    prov_loaded = nwissite0_nabuprov(start=pruned)
    orgs_loaded = nwissite0_nabuorg(start=prov_loaded)

    # Reports that must run after the load has finished.
    graph_missing_done = nwissite0_missingreport_graph(start=orgs_loaded)
    nwissite0_graph_reports(start=graph_missing_done)
get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
def _graphEndpoint():
    """Return the SPARQL endpoint URL for the configured graph namespace.

    Built as ``<GLEANER_GRAPH_URL>/namespace/<GLEANER_GRAPH_NAMESPACE>/sparql``.
    """
    url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql"
    return url


def _pythonMinioUrl(url):
    """Map an S3 host name to the host the Python Minio client should use.

    Any host ending in ``.amazonaws.com`` is replaced by ``s3.amazonaws.com``;
    every other value is returned unchanged.
    """
    if url.endswith(".amazonaws.com"):
        return "s3.amazonaws.com"
    return url


def read_file_bytestream(image_path):
    """Read a file and return its full contents as ``bytes``.

    :param image_path: path of the file to read.
    :returns: the raw bytes of the file.
    :raises OSError: if the file cannot be opened or read.
    """
    # Use a context manager so the handle is closed deterministically; the
    # previous ``open(...).read()`` left closing to the garbage collector.
    with open(image_path, 'rb') as handle:
        return handle.read()
def s3reader(object):
    """Fetch an object from the configured Minio/S3 bucket.

    :param object: name (key) of the object inside ``GLEANER_MINIO_BUCKET``.
    :returns: whatever ``Minio.get_object`` returns, or ``None`` when the read
        fails with ``S3Error`` (the error is logged, not raised).
    """
    logger = get_dagster_logger()
    endpoint = ":".join((_pythonMinioUrl(GLEANER_MINIO_ADDRESS), GLEANER_MINIO_PORT))

    # Connection details are logged; the access and secret keys are not.
    logger.info(f"S3 URL : {GLEANER_MINIO_ADDRESS}")
    logger.info(f"S3 PYTHON SERVER : {endpoint}")
    logger.info(f"S3 PORT : {GLEANER_MINIO_PORT}")
    logger.info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}")
    logger.debug(f"S3 object : {str(object)}")

    minio_client = Minio(
        endpoint,
        secure=GLEANER_MINIO_USE_SSL,
        access_key=GLEANER_MINIO_ACCESS_KEY,
        secret_key=GLEANER_MINIO_SECRET_KEY,
    )
    try:
        return minio_client.get_object(GLEANER_MINIO_BUCKET, object)
    except S3Error as read_error:
        logger.info(f"S3 read error : {str(read_error)}")
    return None
def postRelease(source):
    """Ask the graph endpoint to load the latest release file of ``source``.

    The release is addressed as
    ``<proto>://<minio address>:<port>/<bucket>/graphs/latest/<source>_release.nq``
    (``https`` when ``GLEANER_MINIO_USE_SSL`` is truthy, otherwise ``http``) and
    handed to the SPARQL endpoint through the ``uri`` query parameter of a POST.

    :param source: source name used to build the release file name.
    :returns: ``True`` when the endpoint answers 200 and reports modified data.
    :raises Exception: on a non-200 status, or when the response text contains
        ``data modified="0"``.
    """
    logger = get_dagster_logger()
    scheme = "https" if GLEANER_MINIO_USE_SSL else "http"
    host = GLEANER_MINIO_ADDRESS
    port = GLEANER_MINIO_PORT
    bucket = GLEANER_MINIO_BUCKET
    release_url = f"{scheme}://{host}:{port}/{bucket}/graphs/latest/{source}_release.nq"
    insert_url = f"{_graphEndpoint()}?uri={release_url}"

    logger.info(f'graph: insert "{source}" to {insert_url} ')
    response = requests.post(insert_url)
    log.debug(f' status:{response.status_code}')
    logger.info(f'graph: insert: status:{response.status_code}')

    # Anything but 200 is a failed insert.
    if response.status_code != 200:
        logger.info('graph: error')
        raise Exception(f' graph: insert failed: status:{response.status_code}')

    # A 200 that modified nothing is also treated as a failure.
    if 'data modified="0"' in response.text:
        logger.info('graph: no data inserted ')
        raise Exception("No Data Added: " + response.text)
    return True
def _create_service(
    op_context: OpExecutionContext,
    client,
    container_context: DockerContainerContext,
    image: str,
    entrypoint: Optional[Sequence[str]],
    command: Optional[Sequence[str]],
    name="",
    workingdir="/",
):
    """Create a one-shot Docker swarm service and wait for its container.

    The service is created as a ``replicated-job`` (one replica, concurrency 1,
    restart policy ``none``) with the gleaner and nabu Docker configs attached
    at ``GLEANERIO_GLEANER_CONFIG_PATH`` / ``GLEANERIO_NABU_CONFIG_PATH``. The
    function then polls once per second for a container labelled
    ``com.docker.swarm.service.name=<name>``.

    :param op_context: accepted for interface compatibility; not used here.
    :param client: Docker client used for configs, services and containers.
    :param container_context: supplies the env vars and networks for the service.
    :param image: image to run.
    :param entrypoint: accepted for interface compatibility; not used here.
    :param command: passed to the service as ``args``.
    :param name: service name, also used to find the service's container.
    :param workingdir: working directory inside the container.
    :returns: ``(service, container)``, the first container found.
    :raises RuntimeError: if no container shows up after more than 12 polls.
    :raises IndexError: if either Docker config cannot be found by name.
    """
    logger = get_dagster_logger()
    env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
    logger.info(f"create docker service for {name}")

    restart_policy = RestartPolicy(condition='none')
    # For a replicated job, total completions == replicas; with replicas=0 no
    # container is created at all.
    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)

    logger.info(str(client.configs.list()))
    gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]})
    logger.info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
    nabuconfig = client.configs.list(filters={"name": [GLEANERIO_NABU_DOCKER_CONFIG]})
    logger.info(f"docker config nabu id {str(nabuconfig[0].id)}")
    logger.info(f"create docker service for {name}")

    configs = [
        ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG, GLEANERIO_GLEANER_CONFIG_PATH),
        ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG, GLEANERIO_NABU_CONFIG_PATH),
    ]
    service = client.services.create(
        image,
        args=command,
        env=env_vars,
        name=name,
        networks=container_context.networks if len(container_context.networks) else None,
        restart_policy=restart_policy,
        mode=service_mode,
        workdir=workingdir,
        configs=configs,
    )

    wait_count = 0
    while True:
        time.sleep(1)
        wait_count += 1
        logger.debug(str(service.tasks()))

        containers = client.containers.list(
            all=True, filters={"label": f"com.docker.swarm.service.name={name}"}
        )
        if len(containers) > 0:
            break
        if wait_count > 12:
            # Raising a bare string is itself a TypeError in Python 3 and would
            # hide this message, so raise a real exception.
            # NOTE(review): the service created above is not removed on this
            # path and the caller never receives a handle to it.
            raise RuntimeError(f"Container for service {name} not starting")

    logger.info(len(containers))
    return service, containers[0]
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwissite1_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwissite1_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwissite1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite1_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwissite1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite1_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwissite1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite1_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwissite1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite1_naburelease(context): - returned_value = gleanerio(context,("release"), "nwissite1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwissite1_uploadrelease(context): - returned_value = postRelease("nwissite1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def nwissite1_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwissite1" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwissite1_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwissite1" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwissite1_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite1") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "nwissite1" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite1_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite1") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwissite1" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite1_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwissite1" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwissite1"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwissite1" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwissite1(): - containers = nwissite1_getImage() - harvest = nwissite1_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwissite1_missingreport_s3(start=harvest) - report_idstat = nwissite1_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwissite1_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwissite1") - load_release = nwissite1_naburelease(start=harvest) - load_uploadrelease = nwissite1_uploadrelease(start=load_release) - - load_prune = nwissite1_nabu_prune(start=load_uploadrelease) - load_prov = nwissite1_nabuprov(start=load_prune) - load_org = nwissite1_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwissite1_missingreport_graph(start=load_org) - report_graph=nwissite1_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite2.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite2.py deleted file mode 100644 index 29c8c5b4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite2.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, 
get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwissite2_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwissite2_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwissite2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite2_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwissite2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite2_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwissite2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite2_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwissite2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite2_naburelease(context): - returned_value = gleanerio(context,("release"), "nwissite2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwissite2_uploadrelease(context): - returned_value = postRelease("nwissite2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def nwissite2_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite2") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwissite2" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwissite2_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite2") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwissite2" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwissite2_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite2") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "nwissite2" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite2_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite2") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwissite2" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite2_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwissite2" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwissite2"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwissite2" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwissite2(): - containers = nwissite2_getImage() - harvest = nwissite2_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwissite2_missingreport_s3(start=harvest) - report_idstat = nwissite2_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwissite2_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwissite2") - load_release = nwissite2_naburelease(start=harvest) - load_uploadrelease = nwissite2_uploadrelease(start=load_release) - - load_prune = nwissite2_nabu_prune(start=load_uploadrelease) - load_prov = nwissite2_nabuprov(start=load_prune) - load_org = nwissite2_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwissite2_missingreport_graph(start=load_org) - report_graph=nwissite2_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite3.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite3.py deleted file mode 100644 index bca31ef4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_nwissite3.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, 
get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def nwissite3_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def nwissite3_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "nwissite3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite3_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "nwissite3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite3_nabuprov(context): - returned_value = gleanerio(context,("prov"), "nwissite3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite3_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "nwissite3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite3_naburelease(context): - returned_value = gleanerio(context,("release"), "nwissite3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwissite3_uploadrelease(context): - returned_value = postRelease("nwissite3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def nwissite3_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite3") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwissite3" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwissite3_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite3") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwissite3" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def nwissite3_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite3") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "nwissite3" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite3_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="nwissite3") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwissite3" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def nwissite3_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "nwissite3" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="nwissite3"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="nwissite3" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_nwissite3(): - containers = nwissite3_getImage() - harvest = nwissite3_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = nwissite3_missingreport_s3(start=harvest) - report_idstat = nwissite3_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = nwissite3_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="nwissite3") - load_release = nwissite3_naburelease(start=harvest) - load_uploadrelease = nwissite3_uploadrelease(start=load_release) - - load_prune = nwissite3_nabu_prune(start=load_uploadrelease) - load_prov = nwissite3_nabuprov(start=load_prune) - load_org = nwissite3_nabuorg(start=load_prov) - -# run after load - report_msgraph=nwissite3_missingreport_graph(start=load_org) - report_graph=nwissite3_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_places0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_places0.py deleted file mode 100644 index ffc44452..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_places0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, 
get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def places0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def places0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "places0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def places0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "places0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def places0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "places0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def places0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "places0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def places0_naburelease(context): - returned_value = gleanerio(context,("release"), "places0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def places0_uploadrelease(context): - returned_value = postRelease("places0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
places0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="places0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "places0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def places0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="places0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "places0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def places0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="places0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"places0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def places0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="places0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "places0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def places0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "places0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="places0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="places0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_places0(): - containers = places0_getImage() - harvest = places0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = places0_missingreport_s3(start=harvest) - report_idstat = places0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = places0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="places0") - load_release = places0_naburelease(start=harvest) - load_uploadrelease = places0_uploadrelease(start=load_release) - - load_prune = places0_nabu_prune(start=load_uploadrelease) - load_prov = places0_nabuprov(start=load_prune) - load_org = places0_nabuorg(start=load_prov) - -# run after load - report_msgraph=places0_missingreport_graph(start=load_org) - report_graph=places0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_princiaq0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_princiaq0.py deleted file mode 100644 index 0f9a8663..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_princiaq0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io 
-import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def princiaq0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def princiaq0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "princiaq0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def princiaq0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "princiaq0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def princiaq0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "princiaq0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def princiaq0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "princiaq0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def princiaq0_naburelease(context): - returned_value = gleanerio(context,("release"), "princiaq0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def princiaq0_uploadrelease(context): - returned_value = postRelease("princiaq0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": 
In(Nothing)}) -def princiaq0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="princiaq0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "princiaq0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def princiaq0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="princiaq0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "princiaq0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def princiaq0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="princiaq0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = 
GLEANER_MINIO_BUCKET - source_name = "princiaq0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def princiaq0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="princiaq0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "princiaq0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def princiaq0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "princiaq0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="princiaq0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="princiaq0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_princiaq0(): - containers = princiaq0_getImage() - harvest = princiaq0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = princiaq0_missingreport_s3(start=harvest) - report_idstat = princiaq0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = princiaq0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="princiaq0") - load_release = princiaq0_naburelease(start=harvest) - load_uploadrelease = princiaq0_uploadrelease(start=load_release) - - load_prune = princiaq0_nabu_prune(start=load_uploadrelease) - load_prov = princiaq0_nabuprov(start=load_prune) - load_org = princiaq0_nabuorg(start=load_prov) - -# run after load - report_msgraph=princiaq0_missingreport_graph(start=load_org) - report_graph=princiaq0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_pws0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_pws0.py deleted file mode 100644 index 673d3d59..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_pws0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger 
-import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) 
-GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with 
urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def pws0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def pws0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "pws0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def pws0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "pws0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def pws0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "pws0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def pws0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "pws0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def pws0_naburelease(context): - returned_value = gleanerio(context,("release"), "pws0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def pws0_uploadrelease(context): - returned_value = postRelease("pws0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def pws0_missingreport_s3(context): - source = 
getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="pws0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "pws0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def pws0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="pws0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "pws0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def pws0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="pws0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "pws0" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def pws0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="pws0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "pws0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def pws0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "pws0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="pws0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="pws0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_pws0(): - containers = pws0_getImage() - harvest = pws0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = pws0_missingreport_s3(start=harvest) - report_idstat = pws0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = pws0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="pws0") - load_release = pws0_naburelease(start=harvest) - load_uploadrelease = pws0_uploadrelease(start=load_release) - - load_prune = pws0_nabu_prune(start=load_uploadrelease) - load_prov = pws0_nabuprov(start=load_prune) - load_org = pws0_nabuorg(start=load_prov) - -# run after load - report_msgraph=pws0_missingreport_graph(start=load_org) - report_graph=pws0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage0.py deleted file mode 100644 index 99eb59f6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from 
urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def refgage0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def refgage0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "refgage0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "refgage0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "refgage0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "refgage0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage0_naburelease(context): - returned_value = gleanerio(context,("release"), "refgage0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def refgage0_uploadrelease(context): - returned_value = postRelease("refgage0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
refgage0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def refgage0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def refgage0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "refgage0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="refgage0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="refgage0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_refgage0(): - containers = refgage0_getImage() - harvest = refgage0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = refgage0_missingreport_s3(start=harvest) - report_idstat = refgage0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = refgage0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="refgage0") - load_release = refgage0_naburelease(start=harvest) - load_uploadrelease = refgage0_uploadrelease(start=load_release) - - load_prune = refgage0_nabu_prune(start=load_uploadrelease) - load_prov = refgage0_nabuprov(start=load_prune) - load_org = refgage0_nabuorg(start=load_prov) - -# run after load - report_msgraph=refgage0_missingreport_graph(start=load_org) - report_graph=refgage0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage1.py deleted file mode 100644 index 00d8d18e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage1.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def refgage1_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def refgage1_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "refgage1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage1_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "refgage1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage1_nabuprov(context): - returned_value = gleanerio(context,("prov"), "refgage1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage1_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "refgage1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage1_naburelease(context): - returned_value = gleanerio(context,("release"), "refgage1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def refgage1_uploadrelease(context): - returned_value = postRelease("refgage1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
refgage1_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage1" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def refgage1_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage1" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def refgage1_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage1") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "refgage1" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage1_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage1") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage1" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage1_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage1" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="refgage1"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="refgage1" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_refgage1(): - containers = refgage1_getImage() - harvest = refgage1_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = refgage1_missingreport_s3(start=harvest) - report_idstat = refgage1_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = refgage1_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="refgage1") - load_release = refgage1_naburelease(start=harvest) - load_uploadrelease = refgage1_uploadrelease(start=load_release) - - load_prune = refgage1_nabu_prune(start=load_uploadrelease) - load_prov = refgage1_nabuprov(start=load_prune) - load_org = refgage1_nabuorg(start=load_prov) - -# run after load - report_msgraph=refgage1_missingreport_graph(start=load_org) - report_graph=refgage1_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage2.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage2.py deleted file mode 100644 index c2c46b6c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage2.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def refgage2_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def refgage2_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "refgage2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage2_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "refgage2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage2_nabuprov(context): - returned_value = gleanerio(context,("prov"), "refgage2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage2_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "refgage2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage2_naburelease(context): - returned_value = gleanerio(context,("release"), "refgage2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def refgage2_uploadrelease(context): - returned_value = postRelease("refgage2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
refgage2_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage2") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage2" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def refgage2_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage2") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage2" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def refgage2_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage2") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "refgage2" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage2_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage2") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage2" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage2_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage2" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="refgage2"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="refgage2" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_refgage2(): - containers = refgage2_getImage() - harvest = refgage2_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = refgage2_missingreport_s3(start=harvest) - report_idstat = refgage2_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = refgage2_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="refgage2") - load_release = refgage2_naburelease(start=harvest) - load_uploadrelease = refgage2_uploadrelease(start=load_release) - - load_prune = refgage2_nabu_prune(start=load_uploadrelease) - load_prov = refgage2_nabuprov(start=load_prune) - load_org = refgage2_nabuorg(start=load_prov) - -# run after load - report_msgraph=refgage2_missingreport_graph(start=load_org) - report_graph=refgage2_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage3.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage3.py deleted file mode 100644 index e67f7b4d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_refgage3.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import 
os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET 
=str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - 
data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def refgage3_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def refgage3_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "refgage3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage3_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "refgage3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage3_nabuprov(context): - returned_value = gleanerio(context,("prov"), "refgage3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage3_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "refgage3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage3_naburelease(context): - returned_value = gleanerio(context,("release"), "refgage3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def refgage3_uploadrelease(context): - returned_value = postRelease("refgage3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
refgage3_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage3") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage3" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def refgage3_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage3") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage3" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def refgage3_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage3") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - 
source_name = "refgage3" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage3_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="refgage3") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage3" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def refgage3_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "refgage3" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="refgage3"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="refgage3" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_refgage3(): - containers = refgage3_getImage() - harvest = refgage3_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = refgage3_missingreport_s3(start=harvest) - report_idstat = refgage3_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = refgage3_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="refgage3") - load_release = refgage3_naburelease(start=harvest) - load_uploadrelease = refgage3_uploadrelease(start=load_release) - - load_prune = refgage3_nabu_prune(start=load_uploadrelease) - load_prov = refgage3_nabuprov(start=load_prune) - load_org = refgage3_nabuorg(start=load_prov) - -# run after load - report_msgraph=refgage3_missingreport_graph(start=load_org) - report_graph=refgage3_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_rise0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_rise0.py deleted file mode 100644 index c6e69dc7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_rise0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, 
io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def rise0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def rise0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "rise0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def rise0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "rise0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def rise0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "rise0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def rise0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "rise0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def rise0_naburelease(context): - returned_value = gleanerio(context,("release"), "rise0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def rise0_uploadrelease(context): - returned_value = postRelease("rise0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def rise0_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="rise0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "rise0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def rise0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="rise0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "rise0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def rise0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="rise0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "rise0" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def rise0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="rise0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "rise0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def rise0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "rise0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="rise0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="rise0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_rise0(): - containers = rise0_getImage() - harvest = rise0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = rise0_missingreport_s3(start=harvest) - report_idstat = rise0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = rise0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="rise0") - load_release = rise0_naburelease(start=harvest) - load_uploadrelease = rise0_uploadrelease(start=load_release) - - load_prune = rise0_nabu_prune(start=load_uploadrelease) - load_prov = rise0_nabuprov(start=load_prune) - load_org = rise0_nabuorg(start=load_prov) - -# run after load - report_msgraph=rise0_missingreport_graph(start=load_org) - report_graph=rise0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_sechydrgreg0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_sechydrgreg0.py deleted file mode 100644 index a1148b8f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_sechydrgreg0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from 
urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def sechydrgreg0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def sechydrgreg0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "sechydrgreg0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def sechydrgreg0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "sechydrgreg0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def sechydrgreg0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "sechydrgreg0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def sechydrgreg0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "sechydrgreg0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def sechydrgreg0_naburelease(context): - returned_value = gleanerio(context,("release"), "sechydrgreg0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def sechydrgreg0_uploadrelease(context): - returned_value = postRelease("sechydrgreg0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") 
- return - - -@op(ins={"start": In(Nothing)}) -def sechydrgreg0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="sechydrgreg0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "sechydrgreg0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def sechydrgreg0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="sechydrgreg0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "sechydrgreg0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def sechydrgreg0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="sechydrgreg0") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "sechydrgreg0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def sechydrgreg0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="sechydrgreg0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "sechydrgreg0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def sechydrgreg0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "sechydrgreg0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. 
Then import these methods? -# def missingreport_s3(context, msg: str, source="sechydrgreg0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="sechydrgreg0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_sechydrgreg0(): - containers = sechydrgreg0_getImage() - harvest = sechydrgreg0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = sechydrgreg0_missingreport_s3(start=harvest) - report_idstat = sechydrgreg0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = sechydrgreg0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="sechydrgreg0") - load_release = sechydrgreg0_naburelease(start=harvest) - load_uploadrelease = sechydrgreg0_uploadrelease(start=load_release) - - load_prune = sechydrgreg0_nabu_prune(start=load_uploadrelease) - load_prov = sechydrgreg0_nabuprov(start=load_prune) - load_org = sechydrgreg0_nabuorg(start=load_prov) - -# run after load - report_msgraph=sechydrgreg0_missingreport_graph(start=load_org) - report_graph=sechydrgreg0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_selfieids0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_selfieids0.py deleted file mode 100644 index ea9c7fc3..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_selfieids0.py +++ /dev/null @@ -1,728 +0,0 @@ -import 
distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = 
str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def 
read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def selfieids0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def selfieids0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "selfieids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def selfieids0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "selfieids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def selfieids0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "selfieids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def selfieids0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "selfieids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def selfieids0_naburelease(context): - returned_value = gleanerio(context,("release"), "selfieids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def selfieids0_uploadrelease(context): - returned_value = postRelease("selfieids0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - 
-@op(ins={"start": In(Nothing)}) -def selfieids0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="selfieids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "selfieids0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def selfieids0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="selfieids0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "selfieids0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def selfieids0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="selfieids0") - #source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "selfieids0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def selfieids0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="selfieids0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "selfieids0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def selfieids0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "selfieids0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. 
Then import these methods? -# def missingreport_s3(context, msg: str, source="selfieids0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="selfieids0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_selfieids0(): - containers = selfieids0_getImage() - harvest = selfieids0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = selfieids0_missingreport_s3(start=harvest) - report_idstat = selfieids0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = selfieids0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="selfieids0") - load_release = selfieids0_naburelease(start=harvest) - load_uploadrelease = selfieids0_uploadrelease(start=load_release) - - load_prune = selfieids0_nabu_prune(start=load_uploadrelease) - load_prov = selfieids0_nabuprov(start=load_prune) - load_org = selfieids0_nabuorg(start=load_prov) - -# run after load - report_msgraph=selfieids0_missingreport_graph(start=load_org) - report_graph=selfieids0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_states0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_states0.py deleted file mode 100644 index aeff1252..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_states0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import 
job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = 
str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def states0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def states0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "states0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def states0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "states0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def states0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "states0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def states0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "states0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def states0_naburelease(context): - returned_value = gleanerio(context,("release"), "states0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def states0_uploadrelease(context): - returned_value = postRelease("states0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
states0_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="states0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "states0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def states0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="states0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "states0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def states0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="states0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = 
"states0" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def states0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="states0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "states0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def states0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "states0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="states0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="states0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_states0(): - containers = states0_getImage() - harvest = states0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = states0_missingreport_s3(start=harvest) - report_idstat = states0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = states0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="states0") - load_release = states0_naburelease(start=harvest) - load_uploadrelease = states0_uploadrelease(start=load_release) - - load_prune = states0_nabu_prune(start=load_uploadrelease) - load_prov = states0_nabuprov(start=load_prune) - load_org = states0_nabuorg(start=load_prov) - -# run after load - report_msgraph=states0_missingreport_graph(start=load_org) - report_graph=states0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_ua100.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_ua100.py deleted file mode 100644 index 160d7c96..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_ua100.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib 
-from urllib import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def ua100_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def ua100_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "ua100") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ua100_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "ua100") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ua100_nabuprov(context): - returned_value = gleanerio(context,("prov"), "ua100") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ua100_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "ua100") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ua100_naburelease(context): - returned_value = gleanerio(context,("release"), "ua100") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def ua100_uploadrelease(context): - returned_value = postRelease("ua100") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def ua100_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="ua100") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ua100" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def ua100_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="ua100") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ua100" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def ua100_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="ua100") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ua100" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ua100_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="ua100") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ua100" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def ua100_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ua100" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="ua100"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="ua100" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_ua100(): - containers = ua100_getImage() - harvest = ua100_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = ua100_missingreport_s3(start=harvest) - report_idstat = ua100_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = ua100_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="ua100") - load_release = ua100_naburelease(start=harvest) - load_uploadrelease = ua100_uploadrelease(start=load_release) - - load_prune = ua100_nabu_prune(start=load_uploadrelease) - load_prov = ua100_nabuprov(start=load_prune) - load_org = ua100_nabuorg(start=load_prov) - -# run after load - report_msgraph=ua100_missingreport_graph(start=load_org) - report_graph=ua100_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade0.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade0.py deleted file mode 100644 index 3015cf37..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade0.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade0_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade0_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade0_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade0_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade0_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade0_naburelease(context): - returned_value = gleanerio(context,("release"), "wade0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade0_uploadrelease(context): - returned_value = postRelease("wade0") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def wade0_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade0" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade0_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade0") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade0" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade0_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade0") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade0" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade0_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade0") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade0" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade0_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade0" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade0"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade0" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade0(): - containers = wade0_getImage() - harvest = wade0_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade0_missingreport_s3(start=harvest) - report_idstat = wade0_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade0_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade0") - load_release = wade0_naburelease(start=harvest) - load_uploadrelease = wade0_uploadrelease(start=load_release) - - load_prune = wade0_nabu_prune(start=load_uploadrelease) - load_prov = wade0_nabuprov(start=load_prune) - load_org = wade0_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade0_missingreport_graph(start=load_org) - report_graph=wade0_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade1.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade1.py deleted file mode 100644 index 52979dd5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade1.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade1_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade1_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade1_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade1_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade1_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade1_naburelease(context): - returned_value = gleanerio(context,("release"), "wade1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade1_uploadrelease(context): - returned_value = postRelease("wade1") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def wade1_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade1" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade1_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade1") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade1" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade1_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade1") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade1" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade1_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade1") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade1" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade1_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade1" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade1"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade1" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade1(): - containers = wade1_getImage() - harvest = wade1_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade1_missingreport_s3(start=harvest) - report_idstat = wade1_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade1_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade1") - load_release = wade1_naburelease(start=harvest) - load_uploadrelease = wade1_uploadrelease(start=load_release) - - load_prune = wade1_nabu_prune(start=load_uploadrelease) - load_prov = wade1_nabuprov(start=load_prune) - load_org = wade1_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade1_missingreport_graph(start=load_org) - report_graph=wade1_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade10.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade10.py deleted file mode 100644 index eecc625a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade10.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade10_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade10_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade10_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade10_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade10_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade10_naburelease(context): - returned_value = gleanerio(context,("release"), "wade10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade10_uploadrelease(context): - returned_value = postRelease("wade10") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
wade10_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade10") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade10" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade10_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade10") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade10" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade10_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade10") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade10" - - 
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade10_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade10") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade10" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade10_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade10" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade10"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade10" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade10(): - containers = wade10_getImage() - harvest = wade10_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade10_missingreport_s3(start=harvest) - report_idstat = wade10_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade10_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade10") - load_release = wade10_naburelease(start=harvest) - load_uploadrelease = wade10_uploadrelease(start=load_release) - - load_prune = wade10_nabu_prune(start=load_uploadrelease) - load_prov = wade10_nabuprov(start=load_prune) - load_org = wade10_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade10_missingreport_graph(start=load_org) - report_graph=wade10_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade11.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade11.py deleted file mode 100644 index 915ab299..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade11.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib 
import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade11_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade11_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade11") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade11_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade11") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade11_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade11") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade11_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade11") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade11_naburelease(context): - returned_value = gleanerio(context,("release"), "wade11") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade11_uploadrelease(context): - returned_value = postRelease("wade11") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
wade11_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade11") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade11" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade11_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade11") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade11" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade11_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade11") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade11" - - 
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade11_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade11") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade11" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade11_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade11" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade11"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade11" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade11(): - containers = wade11_getImage() - harvest = wade11_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade11_missingreport_s3(start=harvest) - report_idstat = wade11_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade11_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade11") - load_release = wade11_naburelease(start=harvest) - load_uploadrelease = wade11_uploadrelease(start=load_release) - - load_prune = wade11_nabu_prune(start=load_uploadrelease) - load_prov = wade11_nabuprov(start=load_prune) - load_org = wade11_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade11_missingreport_graph(start=load_org) - report_graph=wade11_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade12.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade12.py deleted file mode 100644 index d267c955..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade12.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib 
import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade12_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade12_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade12") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade12_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade12") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade12_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade12") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade12_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade12") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade12_naburelease(context): - returned_value = gleanerio(context,("release"), "wade12") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade12_uploadrelease(context): - returned_value = postRelease("wade12") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
wade12_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade12") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade12" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade12_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade12") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade12" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade12_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade12") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade12" - - 
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade12_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade12") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade12" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade12_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade12" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade12"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade12" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade12(): - containers = wade12_getImage() - harvest = wade12_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade12_missingreport_s3(start=harvest) - report_idstat = wade12_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade12_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade12") - load_release = wade12_naburelease(start=harvest) - load_uploadrelease = wade12_uploadrelease(start=load_release) - - load_prune = wade12_nabu_prune(start=load_uploadrelease) - load_prov = wade12_nabuprov(start=load_prune) - load_org = wade12_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade12_missingreport_graph(start=load_org) - report_graph=wade12_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade13.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade13.py deleted file mode 100644 index 1b0701bd..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade13.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib 
import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade13_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade13_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade13") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade13_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade13") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade13_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade13") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade13_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade13") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade13_naburelease(context): - returned_value = gleanerio(context,("release"), "wade13") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade13_uploadrelease(context): - returned_value = postRelease("wade13") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
wade13_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade13") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade13" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade13_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade13") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade13" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade13_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade13") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade13" - - 
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade13_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade13") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade13" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade13_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade13" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade13"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade13" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade13(): - containers = wade13_getImage() - harvest = wade13_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade13_missingreport_s3(start=harvest) - report_idstat = wade13_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade13_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade13") - load_release = wade13_naburelease(start=harvest) - load_uploadrelease = wade13_uploadrelease(start=load_release) - - load_prune = wade13_nabu_prune(start=load_uploadrelease) - load_prov = wade13_nabuprov(start=load_prune) - load_org = wade13_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade13_missingreport_graph(start=load_org) - report_graph=wade13_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade14.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade14.py deleted file mode 100644 index 65d7d3ad..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade14.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib 
import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade14_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade14_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade14") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade14_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade14") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade14_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade14") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade14_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade14") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade14_naburelease(context): - returned_value = gleanerio(context,("release"), "wade14") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade14_uploadrelease(context): - returned_value = postRelease("wade14") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
wade14_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade14") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade14" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade14_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade14") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade14" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade14_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade14") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade14" - - 
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade14_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade14") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade14" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade14_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade14" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade14"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade14" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade14(): - containers = wade14_getImage() - harvest = wade14_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade14_missingreport_s3(start=harvest) - report_idstat = wade14_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade14_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade14") - load_release = wade14_naburelease(start=harvest) - load_uploadrelease = wade14_uploadrelease(start=load_release) - - load_prune = wade14_nabu_prune(start=load_uploadrelease) - load_prov = wade14_nabuprov(start=load_prune) - load_org = wade14_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade14_missingreport_graph(start=load_org) - report_graph=wade14_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade15.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade15.py deleted file mode 100644 index 97cee9b6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade15.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib 
import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade15_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade15_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade15") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade15_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade15") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade15_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade15") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade15_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade15") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade15_naburelease(context): - returned_value = gleanerio(context,("release"), "wade15") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade15_uploadrelease(context): - returned_value = postRelease("wade15") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
wade15_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade15") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade15" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade15_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade15") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade15" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade15_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade15") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade15" - - 
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade15_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade15") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade15" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade15_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade15" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade15"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade15" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade15(): - containers = wade15_getImage() - harvest = wade15_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade15_missingreport_s3(start=harvest) - report_idstat = wade15_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade15_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade15") - load_release = wade15_naburelease(start=harvest) - load_uploadrelease = wade15_uploadrelease(start=load_release) - - load_prune = wade15_nabu_prune(start=load_uploadrelease) - load_prov = wade15_nabuprov(start=load_prune) - load_org = wade15_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade15_missingreport_graph(start=load_org) - report_graph=wade15_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade16.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade16.py deleted file mode 100644 index 6951b8ab..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade16.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib 
import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade16_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade16_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade16") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade16_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade16") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade16_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade16") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade16_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade16") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade16_naburelease(context): - returned_value = gleanerio(context,("release"), "wade16") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade16_uploadrelease(context): - returned_value = postRelease("wade16") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
wade16_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade16") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade16" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade16_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade16") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade16" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade16_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade16") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade16" - - 
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade16_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade16") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade16" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade16_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade16" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade16"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade16" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade16(): - containers = wade16_getImage() - harvest = wade16_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade16_missingreport_s3(start=harvest) - report_idstat = wade16_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade16_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade16") - load_release = wade16_naburelease(start=harvest) - load_uploadrelease = wade16_uploadrelease(start=load_release) - - load_prune = wade16_nabu_prune(start=load_uploadrelease) - load_prov = wade16_nabuprov(start=load_prune) - load_org = wade16_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade16_missingreport_graph(start=load_org) - report_graph=wade16_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade17.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade17.py deleted file mode 100644 index 8ce081ef..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade17.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib 
import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade17_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade17_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade17") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade17_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade17") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade17_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade17") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade17_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade17") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade17_naburelease(context): - returned_value = gleanerio(context,("release"), "wade17") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade17_uploadrelease(context): - returned_value = postRelease("wade17") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
wade17_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade17") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade17" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade17_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade17") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade17" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade17_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade17") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade17" - - 
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade17_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade17") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade17" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade17_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade17" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade17"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade17" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade17(): - containers = wade17_getImage() - harvest = wade17_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade17_missingreport_s3(start=harvest) - report_idstat = wade17_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade17_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade17") - load_release = wade17_naburelease(start=harvest) - load_uploadrelease = wade17_uploadrelease(start=load_release) - - load_prune = wade17_nabu_prune(start=load_uploadrelease) - load_prov = wade17_nabuprov(start=load_prune) - load_org = wade17_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade17_missingreport_graph(start=load_org) - report_graph=wade17_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade18.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade18.py deleted file mode 100644 index 0bdeeca0..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade18.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib 
import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade18_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade18_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade18") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade18_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade18") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade18_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade18") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade18_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade18") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade18_naburelease(context): - returned_value = gleanerio(context,("release"), "wade18") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade18_uploadrelease(context): - returned_value = postRelease("wade18") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
wade18_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade18") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade18" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade18_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade18") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade18" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade18_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade18") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade18" - - 
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade18_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade18") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade18" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade18_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade18" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade18"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade18" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade18(): - containers = wade18_getImage() - harvest = wade18_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade18_missingreport_s3(start=harvest) - report_idstat = wade18_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade18_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade18") - load_release = wade18_naburelease(start=harvest) - load_uploadrelease = wade18_uploadrelease(start=load_release) - - load_prune = wade18_nabu_prune(start=load_uploadrelease) - load_prov = wade18_nabuprov(start=load_prune) - load_org = wade18_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade18_missingreport_graph(start=load_org) - report_graph=wade18_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade19.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade19.py deleted file mode 100644 index 96d8f2e6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade19.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib 
import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade19_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade19_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade19") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade19_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade19") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade19_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade19") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade19_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade19") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade19_naburelease(context): - returned_value = gleanerio(context,("release"), "wade19") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade19_uploadrelease(context): - returned_value = postRelease("wade19") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def 
wade19_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade19") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade19" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade19_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade19") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade19" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade19_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade19") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade19" - - 
graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade19_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade19") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade19" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade19_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade19" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade19"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade19" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade19(): - containers = wade19_getImage() - harvest = wade19_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade19_missingreport_s3(start=harvest) - report_idstat = wade19_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade19_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade19") - load_release = wade19_naburelease(start=harvest) - load_uploadrelease = wade19_uploadrelease(start=load_release) - - load_prune = wade19_nabu_prune(start=load_uploadrelease) - load_prov = wade19_nabuprov(start=load_prune) - load_org = wade19_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade19_missingreport_graph(start=load_org) - report_graph=wade19_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade2.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade2.py deleted file mode 100644 index d231a935..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade2.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib 
import request -from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( 
os.environ.get('GLEANERIO_MINIO_BUCKET')) -GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data 
= f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade2_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade2_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade2_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade2_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade2_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade2_naburelease(context): - returned_value = gleanerio(context,("release"), "wade2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade2_uploadrelease(context): - returned_value = postRelease("wade2") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def wade2_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade2") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade2" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade2_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade2") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade2" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade2_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade2") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade2" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade2_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade2") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade2" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade2_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade2" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade2"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade2" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade2(): - containers = wade2_getImage() - harvest = wade2_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade2_missingreport_s3(start=harvest) - report_idstat = wade2_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade2_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade2") - load_release = wade2_naburelease(start=harvest) - load_uploadrelease = wade2_uploadrelease(start=load_release) - - load_prune = wade2_nabu_prune(start=load_uploadrelease) - load_prov = wade2_nabuprov(start=load_prune) - load_org = wade2_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade2_missingreport_graph(start=load_org) - report_graph=wade2_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade3.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade3.py deleted file mode 100644 index 60f2348a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade3.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade3_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade3_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade3_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade3_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade3_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade3_naburelease(context): - returned_value = gleanerio(context,("release"), "wade3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade3_uploadrelease(context): - returned_value = postRelease("wade3") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def wade3_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade3") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade3" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade3_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade3") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade3" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade3_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade3") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade3" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade3_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade3") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade3" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade3_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade3" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade3"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade3" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade3(): - containers = wade3_getImage() - harvest = wade3_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade3_missingreport_s3(start=harvest) - report_idstat = wade3_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade3_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade3") - load_release = wade3_naburelease(start=harvest) - load_uploadrelease = wade3_uploadrelease(start=load_release) - - load_prune = wade3_nabu_prune(start=load_uploadrelease) - load_prov = wade3_nabuprov(start=load_prune) - load_org = wade3_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade3_missingreport_graph(start=load_org) - report_graph=wade3_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade4.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade4.py deleted file mode 100644 index 15dd87e8..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade4.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade4_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade4_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade4_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade4_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade4_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade4_naburelease(context): - returned_value = gleanerio(context,("release"), "wade4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade4_uploadrelease(context): - returned_value = postRelease("wade4") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def wade4_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade4") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade4" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade4_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade4") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade4" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade4_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade4") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade4" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade4_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade4") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade4" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade4_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade4" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade4"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade4" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade4(): - containers = wade4_getImage() - harvest = wade4_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade4_missingreport_s3(start=harvest) - report_idstat = wade4_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade4_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade4") - load_release = wade4_naburelease(start=harvest) - load_uploadrelease = wade4_uploadrelease(start=load_release) - - load_prune = wade4_nabu_prune(start=load_uploadrelease) - load_prov = wade4_nabuprov(start=load_prune) - load_org = wade4_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade4_missingreport_graph(start=load_org) - report_graph=wade4_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade5.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade5.py deleted file mode 100644 index 4c7e0c82..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade5.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade5_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade5_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade5") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade5_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade5") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade5_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade5") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade5_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade5") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade5_naburelease(context): - returned_value = gleanerio(context,("release"), "wade5") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade5_uploadrelease(context): - returned_value = postRelease("wade5") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def wade5_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade5") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade5" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade5_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade5") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade5" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade5_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade5") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade5" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade5_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade5") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade5" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade5_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade5" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade5"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade5" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade5(): - containers = wade5_getImage() - harvest = wade5_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade5_missingreport_s3(start=harvest) - report_idstat = wade5_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade5_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade5") - load_release = wade5_naburelease(start=harvest) - load_uploadrelease = wade5_uploadrelease(start=load_release) - - load_prune = wade5_nabu_prune(start=load_uploadrelease) - load_prov = wade5_nabuprov(start=load_prune) - load_org = wade5_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade5_missingreport_graph(start=load_org) - report_graph=wade5_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade6.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade6.py deleted file mode 100644 index ca3e0953..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade6.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade6_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade6_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade6") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade6_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade6") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade6_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade6") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade6_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade6") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade6_naburelease(context): - returned_value = gleanerio(context,("release"), "wade6") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade6_uploadrelease(context): - returned_value = postRelease("wade6") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def wade6_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade6") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade6" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade6_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade6") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade6" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade6_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade6") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade6" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade6_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade6") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade6" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade6_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade6" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade6"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade6" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade6(): - containers = wade6_getImage() - harvest = wade6_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade6_missingreport_s3(start=harvest) - report_idstat = wade6_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade6_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade6") - load_release = wade6_naburelease(start=harvest) - load_uploadrelease = wade6_uploadrelease(start=load_release) - - load_prune = wade6_nabu_prune(start=load_uploadrelease) - load_prov = wade6_nabuprov(start=load_prune) - load_org = wade6_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade6_missingreport_graph(start=load_org) - report_graph=wade6_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade7.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade7.py deleted file mode 100644 index 2b7294df..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade7.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade7_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade7_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade7") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade7_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade7") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade7_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade7") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade7_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade7") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade7_naburelease(context): - returned_value = gleanerio(context,("release"), "wade7") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade7_uploadrelease(context): - returned_value = postRelease("wade7") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def wade7_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade7") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade7" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade7_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade7") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade7" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade7_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade7") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade7" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade7_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade7") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade7" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade7_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade7" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade7"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade7" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade7(): - containers = wade7_getImage() - harvest = wade7_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade7_missingreport_s3(start=harvest) - report_idstat = wade7_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade7_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade7") - load_release = wade7_naburelease(start=harvest) - load_uploadrelease = wade7_uploadrelease(start=load_release) - - load_prune = wade7_nabu_prune(start=load_uploadrelease) - load_prov = wade7_nabuprov(start=load_prune) - load_org = wade7_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade7_missingreport_graph(start=load_org) - report_graph=wade7_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade8.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade8.py deleted file mode 100644 index 3acff804..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade8.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade8_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade8_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade8") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade8_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade8") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade8_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade8") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade8_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade8") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade8_naburelease(context): - returned_value = gleanerio(context,("release"), "wade8") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade8_uploadrelease(context): - returned_value = postRelease("wade8") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def wade8_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade8") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade8" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade8_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade8") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade8" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade8_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade8") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade8" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade8_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade8") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade8" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade8_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade8" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade8"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade8" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade8(): - containers = wade8_getImage() - harvest = wade8_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade8_missingreport_s3(start=harvest) - report_idstat = wade8_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade8_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade8") - load_release = wade8_naburelease(start=harvest) - load_uploadrelease = wade8_uploadrelease(start=load_release) - - load_prune = wade8_nabu_prune(start=load_uploadrelease) - load_prov = wade8_nabuprov(start=load_prune) - load_org = wade8_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade8_missingreport_graph(start=load_org) - report_graph=wade8_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade9.py b/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade9.py deleted file mode 100644 index dbf4c12e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/ops/implnet_ops_wade9.py +++ /dev/null @@ -1,728 +0,0 @@ -import distutils -import time - -from dagster import job, op, graph,In, Nothing, get_dagster_logger -import os, json, io -import urllib -from urllib import request 
-from urllib.error import HTTPError - -from docker.types import RestartPolicy, ServiceMode -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image -from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANERIO_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) -GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) -GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) -GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) 
-GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) -GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) -GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) -GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) -GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) -GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner -GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) -GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) -GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) -GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) -def _graphEndpoint(): - url = f"{GLEANER_GRAPH_URL}/namespace/{GLEANER_GRAPH_NAMESPACE}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with 
open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) + ":" + GLEANER_MINIO_PORT - get_dagster_logger().info(f"S3 URL : {GLEANER_MINIO_ADDRESS}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {GLEANER_MINIO_PORT}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {GLEANER_MINIO_BUCKET}") - get_dagster_logger().debug(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = GLEANER_MINIO_USE_SSL, - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= GLEANER_MINIO_USE_SSL - if (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "80" - and secure == False): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - elif (GLEANER_MINIO_PORT and GLEANER_MINIO_PORT == "443" - and secure == True): - server = _pythonMinioUrl(GLEANER_MINIO_ADDRESS) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(GLEANER_MINIO_ADDRESS)}:{GLEANER_MINIO_PORT}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=GLEANER_MINIO_ACCESS_KEY, - secret_key=GLEANER_MINIO_SECRET_KEY, - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = GLEANERIO_LOG_PREFIX + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if GLEANER_MINIO_USE_SSL: - proto = "https" - port = GLEANER_MINIO_PORT - address = GLEANER_MINIO_ADDRESS - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - 
get_dagster_logger().info(f"create docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _create_service( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="", - workingdir="/", - -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"create docker service for {name}") - ## thoguhts - # return service, container, since there is one - restart_policy = RestartPolicy(condition='none') - # docker.py if replicated job, total completions = replicas - # replicas =0 you do not get a container - serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) - get_dagster_logger().info(str(client.configs.list())) - # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_GLEANER_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) - get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") - get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) - configs 
= [gleaner,nabu] - # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - service = client.services.create( - image, - args=command, - env= env_vars, - name=name , - networks= container_context.networks if len(container_context.networks) else None, - restart_policy = restart_policy, - mode=serivce_mode, - workdir=workingdir, - configs=configs - ) - wait_count =0 - while True: - time.sleep(1) - wait_count+=1 - get_dagster_logger().debug(str(service.tasks())) - - container_task = service.tasks(filters={"service":name}) - - containers = client.containers.list(all=True, filters={"label":f"com.docker.swarm.service.name={name}"}) - if len(containers)> 0: - break - if wait_count > 12: - raise f"Container for service {name} not starting" - - get_dagster_logger().info(len(containers)) - return service, containers[0] - - - - -def gleanerio(context, mode, source): - ## ------------ Create - returnCode = 0 - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE =GLEANERIO_GLEANER_IMAGE - - # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - ARGS = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "prune"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"sch_{source}_{str(mode)}" - 
WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = GLEANERIO_NABU_IMAGE - - ARGS = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"sch_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - - returnCode = 1 - return returnCode - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = ARGS -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") 
- - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would 
like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - - - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - # "volumes": { - # f"{GLEANER_CONFIG_VOLUME}": - # {'bind': '/configs', 'mode': 'rw'} - # }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_service: ") - service, container = _create_service( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME, - workingdir=data["WorkingDir"] - ) - except Exception as err: - raise err - - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - - - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - get_dagster_logger().info(f"watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"watch container logs failed other issue:{repr(ex)} ") - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + '/containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - if (service): - service.remove() - get_dagster_logger().info(f"Service Remove: {service.name}") - else: - get_dagster_logger().info(f"Service Not created, so not removed.") - - else: - get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") - - if (returnCode != 0): - get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") - raise Exception("Gleaner/Nabu container non-zero exit code. 
See logs in S3") - return returnCode - -@op -def wade9_getImage(context): - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(run_container_context) - client.images.pull(GLEANERIO_GLEANER_IMAGE) - client.images.pull(GLEANERIO_NABU_IMAGE) -@op(ins={"start": In(Nothing)}) -def wade9_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "wade9") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade9_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "wade9") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prune returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade9_nabuprov(context): - returned_value = gleanerio(context,("prov"), "wade9") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu prov returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade9_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "wade9") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu org load returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade9_naburelease(context): - returned_value = gleanerio(context,("release"), "wade9") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"nabu release returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade9_uploadrelease(context): - returned_value = postRelease("wade9") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"upload release returned {r} ") - return - - -@op(ins={"start": In(Nothing)}) -def wade9_missingreport_s3(context): - 
source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade9") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade9" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - get_dagster_logger().info(f"missing s3 report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade9_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade9") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade9" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - get_dagster_logger().info(f"missing graph report returned {r} ") - return -@op(ins={"start": In(Nothing)}) -def wade9_graph_reports(context) : - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade9") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade9" - - graphendpoint = _graphEndpoint() # 
f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade9_identifier_stats(context): - source = getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename="wade9") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade9" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") - return - -@op(ins={"start": In(Nothing)}) -def wade9_bucket_urls(context): - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "wade9" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - get_dagster_logger().info(f"bucker urls report returned {r} ") - return - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="wade9"): -# -# source= getSitemapSourcesFromGleaner(GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="wade9" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_wade9(): - containers = wade9_getImage() - harvest = wade9_gleaner(start=containers) - -# defingin nothing dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = wade9_missingreport_s3(start=harvest) - report_idstat = wade9_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = wade9_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="wade9") - load_release = wade9_naburelease(start=harvest) - load_uploadrelease = wade9_uploadrelease(start=load_release) - - load_prune = wade9_nabu_prune(start=load_uploadrelease) - load_prov = wade9_nabuprov(start=load_prune) - load_org = wade9_nabuorg(start=load_prov) - -# run after load - report_msgraph=wade9_missingreport_graph(start=load_org) - report_graph=wade9_graph_reports(start=report_msgraph) - - - - diff --git a/dagster/implnets/generatedCode/implnet-iow/output/repositories/repository.py b/dagster/implnets/generatedCode/implnet-iow/output/repositories/repository.py deleted file mode 100644 index f7f74595..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/repositories/repository.py +++ /dev/null @@ -1,371 +0,0 @@ -from dagster import repository -from jobs.implnet_jobs_nwisgw20 import implnet_job_nwisgw20 -from sch.implnet_sch_nwisgw20 import implnet_sch_nwisgw20 -from 
jobs.implnet_jobs_nwisgw22 import implnet_job_nwisgw22 -from sch.implnet_sch_nwisgw22 import implnet_sch_nwisgw22 -from jobs.implnet_jobs_nwisgw16 import implnet_job_nwisgw16 -from sch.implnet_sch_nwisgw16 import implnet_sch_nwisgw16 -from jobs.implnet_jobs_nwisgw12 import implnet_job_nwisgw12 -from sch.implnet_sch_nwisgw12 import implnet_sch_nwisgw12 -from jobs.implnet_jobs_nwisgw25 import implnet_job_nwisgw25 -from sch.implnet_sch_nwisgw25 import implnet_sch_nwisgw25 -from jobs.implnet_jobs_nwisgw14 import implnet_job_nwisgw14 -from sch.implnet_sch_nwisgw14 import implnet_sch_nwisgw14 -from jobs.implnet_jobs_nwisgw23 import implnet_job_nwisgw23 -from sch.implnet_sch_nwisgw23 import implnet_sch_nwisgw23 -from jobs.implnet_jobs_nwisgw10 import implnet_job_nwisgw10 -from sch.implnet_sch_nwisgw10 import implnet_sch_nwisgw10 -from jobs.implnet_jobs_nwisgw15 import implnet_job_nwisgw15 -from sch.implnet_sch_nwisgw15 import implnet_sch_nwisgw15 -from jobs.implnet_jobs_nwisgw2 import implnet_job_nwisgw2 -from sch.implnet_sch_nwisgw2 import implnet_sch_nwisgw2 -from jobs.implnet_jobs_nwisgw24 import implnet_job_nwisgw24 -from sch.implnet_sch_nwisgw24 import implnet_sch_nwisgw24 -from jobs.implnet_jobs_nwisgw9 import implnet_job_nwisgw9 -from sch.implnet_sch_nwisgw9 import implnet_sch_nwisgw9 -from jobs.implnet_jobs_nwisgw19 import implnet_job_nwisgw19 -from sch.implnet_sch_nwisgw19 import implnet_sch_nwisgw19 -from jobs.implnet_jobs_nwisgw28 import implnet_job_nwisgw28 -from sch.implnet_sch_nwisgw28 import implnet_sch_nwisgw28 -from jobs.implnet_jobs_nwisgw26 import implnet_job_nwisgw26 -from sch.implnet_sch_nwisgw26 import implnet_sch_nwisgw26 -from jobs.implnet_jobs_nwisgw5 import implnet_job_nwisgw5 -from sch.implnet_sch_nwisgw5 import implnet_sch_nwisgw5 -from jobs.implnet_jobs_nwisgw13 import implnet_job_nwisgw13 -from sch.implnet_sch_nwisgw13 import implnet_sch_nwisgw13 -from jobs.implnet_jobs_nwisgw6 import implnet_job_nwisgw6 -from sch.implnet_sch_nwisgw6 import 
implnet_sch_nwisgw6 -from jobs.implnet_jobs_nwisgw3 import implnet_job_nwisgw3 -from sch.implnet_sch_nwisgw3 import implnet_sch_nwisgw3 -from jobs.implnet_jobs_nwisgw4 import implnet_job_nwisgw4 -from sch.implnet_sch_nwisgw4 import implnet_sch_nwisgw4 -from jobs.implnet_jobs_nwisgw1 import implnet_job_nwisgw1 -from sch.implnet_sch_nwisgw1 import implnet_sch_nwisgw1 -from jobs.implnet_jobs_nwisgw21 import implnet_job_nwisgw21 -from sch.implnet_sch_nwisgw21 import implnet_sch_nwisgw21 -from jobs.implnet_jobs_nwisgw27 import implnet_job_nwisgw27 -from sch.implnet_sch_nwisgw27 import implnet_sch_nwisgw27 -from jobs.implnet_jobs_nwisgw8 import implnet_job_nwisgw8 -from sch.implnet_sch_nwisgw8 import implnet_sch_nwisgw8 -from jobs.implnet_jobs_nwisgw17 import implnet_job_nwisgw17 -from sch.implnet_sch_nwisgw17 import implnet_sch_nwisgw17 -from jobs.implnet_jobs_nwisgw18 import implnet_job_nwisgw18 -from sch.implnet_sch_nwisgw18 import implnet_sch_nwisgw18 -from jobs.implnet_jobs_nwisgw7 import implnet_job_nwisgw7 -from sch.implnet_sch_nwisgw7 import implnet_sch_nwisgw7 -from jobs.implnet_jobs_nwisgw11 import implnet_job_nwisgw11 -from sch.implnet_sch_nwisgw11 import implnet_sch_nwisgw11 -from jobs.implnet_jobs_nwisgw0 import implnet_job_nwisgw0 -from sch.implnet_sch_nwisgw0 import implnet_sch_nwisgw0 -from jobs.implnet_jobs_nwissite1 import implnet_job_nwissite1 -from sch.implnet_sch_nwissite1 import implnet_sch_nwissite1 -from jobs.implnet_jobs_nwissite3 import implnet_job_nwissite3 -from sch.implnet_sch_nwissite3 import implnet_sch_nwissite3 -from jobs.implnet_jobs_nwissite0 import implnet_job_nwissite0 -from sch.implnet_sch_nwissite0 import implnet_sch_nwissite0 -from jobs.implnet_jobs_nwissite2 import implnet_job_nwissite2 -from sch.implnet_sch_nwissite2 import implnet_sch_nwissite2 -from jobs.implnet_jobs_gfv11pois1 import implnet_job_gfv11pois1 -from sch.implnet_sch_gfv11pois1 import implnet_sch_gfv11pois1 -from jobs.implnet_jobs_gfv11pois0 import 
implnet_job_gfv11pois0 -from sch.implnet_sch_gfv11pois0 import implnet_sch_gfv11pois0 -from jobs.implnet_jobs_hydrologicunit0 import implnet_job_hydrologicunit0 -from sch.implnet_sch_hydrologicunit0 import implnet_sch_hydrologicunit0 -from jobs.implnet_jobs_damspids0 import implnet_job_damspids0 -from sch.implnet_sch_damspids0 import implnet_sch_damspids0 -from jobs.implnet_jobs_cuahsihishydrodataczhrids0 import implnet_job_cuahsihishydrodataczhrids0 -from sch.implnet_sch_cuahsihishydrodataczhrids0 import implnet_sch_cuahsihishydrodataczhrids0 -from jobs.implnet_jobs_cuahsihisnooksackmicroclimatenetworkids0 import implnet_job_cuahsihisnooksackmicroclimatenetworkids0 -from sch.implnet_sch_cuahsihisnooksackmicroclimatenetworkids0 import implnet_sch_cuahsihisnooksackmicroclimatenetworkids0 -from jobs.implnet_jobs_cuahsihisneonids0 import implnet_job_cuahsihisneonids0 -from sch.implnet_sch_cuahsihisneonids0 import implnet_sch_cuahsihisneonids0 -from jobs.implnet_jobs_cuahsihisglobalriversobservatoryids0 import implnet_job_cuahsihisglobalriversobservatoryids0 -from sch.implnet_sch_cuahsihisglobalriversobservatoryids0 import implnet_sch_cuahsihisglobalriversobservatoryids0 -from jobs.implnet_jobs_cuahsihistncwaterdataids0 import implnet_job_cuahsihistncwaterdataids0 -from sch.implnet_sch_cuahsihistncwaterdataids0 import implnet_sch_cuahsihistncwaterdataids0 -from jobs.implnet_jobs_cuahsihisscotlandnwisids0 import implnet_job_cuahsihisscotlandnwisids0 -from sch.implnet_sch_cuahsihisscotlandnwisids0 import implnet_sch_cuahsihisscotlandnwisids0 -from jobs.implnet_jobs_cuahsihisczoboulderids0 import implnet_job_cuahsihisczoboulderids0 -from sch.implnet_sch_cuahsihisczoboulderids0 import implnet_sch_cuahsihisczoboulderids0 -from jobs.implnet_jobs_cuahsihisyosemitehydroclimatenetworkids0 import implnet_job_cuahsihisyosemitehydroclimatenetworkids0 -from sch.implnet_sch_cuahsihisyosemitehydroclimatenetworkids0 import implnet_sch_cuahsihisyosemitehydroclimatenetworkids0 -from 
jobs.implnet_jobs_cuahsihismuddyriverids0 import implnet_job_cuahsihismuddyriverids0 -from sch.implnet_sch_cuahsihismuddyriverids0 import implnet_sch_cuahsihismuddyriverids0 -from jobs.implnet_jobs_cuahsihisczomercedids0 import implnet_job_cuahsihisczomercedids0 -from sch.implnet_sch_cuahsihisczomercedids0 import implnet_sch_cuahsihisczomercedids0 -from jobs.implnet_jobs_cuahsihisghcnids0 import implnet_job_cuahsihisghcnids0 -from sch.implnet_sch_cuahsihisghcnids0 import implnet_sch_cuahsihisghcnids0 -from jobs.implnet_jobs_cuahsihismmaatacamaids0 import implnet_job_cuahsihismmaatacamaids0 -from sch.implnet_sch_cuahsihismmaatacamaids0 import implnet_sch_cuahsihismmaatacamaids0 -from jobs.implnet_jobs_cuahsihisumbcwqids0 import implnet_job_cuahsihisumbcwqids0 -from sch.implnet_sch_cuahsihisumbcwqids0 import implnet_sch_cuahsihisumbcwqids0 -from jobs.implnet_jobs_cuahsihisgleonlakeannieids0 import implnet_job_cuahsihisgleonlakeannieids0 -from sch.implnet_sch_cuahsihisgleonlakeannieids0 import implnet_sch_cuahsihisgleonlakeannieids0 -from jobs.implnet_jobs_cuahsihisluwlids0 import implnet_job_cuahsihisluwlids0 -from sch.implnet_sch_cuahsihisluwlids0 import implnet_sch_cuahsihisluwlids0 -from jobs.implnet_jobs_cuahsihiscedarriverids0 import implnet_job_cuahsihiscedarriverids0 -from sch.implnet_sch_cuahsihiscedarriverids0 import implnet_sch_cuahsihiscedarriverids0 -from jobs.implnet_jobs_cuahsihisccbepdapids0 import implnet_job_cuahsihisccbepdapids0 -from sch.implnet_sch_cuahsihisccbepdapids0 import implnet_sch_cuahsihisccbepdapids0 -from jobs.implnet_jobs_cuahsihiskansasweatherdataids0 import implnet_job_cuahsihiskansasweatherdataids0 -from sch.implnet_sch_cuahsihiskansasweatherdataids0 import implnet_sch_cuahsihiskansasweatherdataids0 -from jobs.implnet_jobs_cuahsihisodmkentstateids0 import implnet_job_cuahsihisodmkentstateids0 -from sch.implnet_sch_cuahsihisodmkentstateids0 import implnet_sch_cuahsihisodmkentstateids0 -from jobs.implnet_jobs_cuahsihisgleondorsetids0 
import implnet_job_cuahsihisgleondorsetids0 -from sch.implnet_sch_cuahsihisgleondorsetids0 import implnet_sch_cuahsihisgleondorsetids0 -from jobs.implnet_jobs_cuahsihisclarksburgspids0 import implnet_job_cuahsihisclarksburgspids0 -from sch.implnet_sch_cuahsihisclarksburgspids0 import implnet_sch_cuahsihisclarksburgspids0 -from jobs.implnet_jobs_cuahsihiscrwaids0 import implnet_job_cuahsihiscrwaids0 -from sch.implnet_sch_cuahsihiscrwaids0 import implnet_sch_cuahsihiscrwaids0 -from jobs.implnet_jobs_cuahsihiscuisoids0 import implnet_job_cuahsihiscuisoids0 -from sch.implnet_sch_cuahsihiscuisoids0 import implnet_sch_cuahsihiscuisoids0 -from jobs.implnet_jobs_cuahsihisprovorivergamutids0 import implnet_job_cuahsihisprovorivergamutids0 -from sch.implnet_sch_cuahsihisprovorivergamutids0 import implnet_sch_cuahsihisprovorivergamutids0 -from jobs.implnet_jobs_cuahsihisirwaids0 import implnet_job_cuahsihisirwaids0 -from sch.implnet_sch_cuahsihisirwaids0 import implnet_sch_cuahsihisirwaids0 -from jobs.implnet_jobs_cuahsihisczoluquilloids0 import implnet_job_cuahsihisczoluquilloids0 -from sch.implnet_sch_cuahsihisczoluquilloids0 import implnet_sch_cuahsihisczoluquilloids0 -from jobs.implnet_jobs_cuahsihistuolumnemdwids0 import implnet_job_cuahsihistuolumnemdwids0 -from sch.implnet_sch_cuahsihistuolumnemdwids0 import implnet_sch_cuahsihistuolumnemdwids0 -from jobs.implnet_jobs_cuahsihisrmblids0 import implnet_job_cuahsihisrmblids0 -from sch.implnet_sch_cuahsihisrmblids0 import implnet_sch_cuahsihisrmblids0 -from jobs.implnet_jobs_cuahsihispanolaodmids0 import implnet_job_cuahsihispanolaodmids0 -from sch.implnet_sch_cuahsihispanolaodmids0 import implnet_sch_cuahsihispanolaodmids0 -from jobs.implnet_jobs_cuahsihisnewnids0 import implnet_job_cuahsihisnewnids0 -from sch.implnet_sch_cuahsihisnewnids0 import implnet_sch_cuahsihisnewnids0 -from jobs.implnet_jobs_cuahsihisczoudelids0 import implnet_job_cuahsihisczoudelids0 -from sch.implnet_sch_cuahsihisczoudelids0 import 
implnet_sch_cuahsihisczoudelids0 -from jobs.implnet_jobs_cuahsihisfarmrwaids0 import implnet_job_cuahsihisfarmrwaids0 -from sch.implnet_sch_cuahsihisfarmrwaids0 import implnet_sch_cuahsihisfarmrwaids0 -from jobs.implnet_jobs_cuahsihisskcmilltownids0 import implnet_job_cuahsihisskcmilltownids0 -from sch.implnet_sch_cuahsihisskcmilltownids0 import implnet_sch_cuahsihisskcmilltownids0 -from jobs.implnet_jobs_cuahsihisumbcgwids0 import implnet_job_cuahsihisumbcgwids0 -from sch.implnet_sch_cuahsihisumbcgwids0 import implnet_sch_cuahsihisumbcgwids0 -from jobs.implnet_jobs_cuahsihisshalenetworkodmids0 import implnet_job_cuahsihisshalenetworkodmids0 -from sch.implnet_sch_cuahsihisshalenetworkodmids0 import implnet_sch_cuahsihisshalenetworkodmids0 -from jobs.implnet_jobs_cuahsihisnevadosids0 import implnet_job_cuahsihisnevadosids0 -from sch.implnet_sch_cuahsihisnevadosids0 import implnet_sch_cuahsihisnevadosids0 -from jobs.implnet_jobs_cuahsihisweiherbachids0 import implnet_job_cuahsihisweiherbachids0 -from sch.implnet_sch_cuahsihisweiherbachids0 import implnet_sch_cuahsihisweiherbachids0 -from jobs.implnet_jobs_cuahsihismazarriverprojectids0 import implnet_job_cuahsihismazarriverprojectids0 -from sch.implnet_sch_cuahsihismazarriverprojectids0 import implnet_sch_cuahsihismazarriverprojectids0 -from jobs.implnet_jobs_cuahsihisgleonsunapeeids0 import implnet_job_cuahsihisgleonsunapeeids0 -from sch.implnet_sch_cuahsihisgleonsunapeeids0 import implnet_sch_cuahsihisgleonsunapeeids0 -from jobs.implnet_jobs_cuahsihisorsancohabids0 import implnet_job_cuahsihisorsancohabids0 -from sch.implnet_sch_cuahsihisorsancohabids0 import implnet_sch_cuahsihisorsancohabids0 -from jobs.implnet_jobs_cuahsihismwraids0 import implnet_job_cuahsihismwraids0 -from sch.implnet_sch_cuahsihismwraids0 import implnet_sch_cuahsihismwraids0 -from jobs.implnet_jobs_cuahsihismaaeriids0 import implnet_job_cuahsihismaaeriids0 -from sch.implnet_sch_cuahsihismaaeriids0 import implnet_sch_cuahsihismaaeriids0 -from 
jobs.implnet_jobs_cuahsihisnceiww2ids0 import implnet_job_cuahsihisnceiww2ids0 -from sch.implnet_sch_cuahsihisnceiww2ids0 import implnet_sch_cuahsihisnceiww2ids0 -from jobs.implnet_jobs_cuahsihistarlandwaterqualityids0 import implnet_job_cuahsihistarlandwaterqualityids0 -from sch.implnet_sch_cuahsihistarlandwaterqualityids0 import implnet_sch_cuahsihistarlandwaterqualityids0 -from jobs.implnet_jobs_cuahsihislczoodm2ids0 import implnet_job_cuahsihislczoodm2ids0 -from sch.implnet_sch_cuahsihislczoodm2ids0 import implnet_sch_cuahsihislczoodm2ids0 -from jobs.implnet_jobs_cuahsihiscocorahsids0 import implnet_job_cuahsihiscocorahsids0 -from sch.implnet_sch_cuahsihiscocorahsids0 import implnet_sch_cuahsihiscocorahsids0 -from jobs.implnet_jobs_cuahsihisparalanaturalezaids0 import implnet_job_cuahsihisparalanaturalezaids0 -from sch.implnet_sch_cuahsihisparalanaturalezaids0 import implnet_sch_cuahsihisparalanaturalezaids0 -from jobs.implnet_jobs_cuahsihisczocatalinaids0 import implnet_job_cuahsihisczocatalinaids0 -from sch.implnet_sch_cuahsihisczocatalinaids0 import implnet_sch_cuahsihisczocatalinaids0 -from jobs.implnet_jobs_cuahsihisieeratwilkesuniversityids0 import implnet_job_cuahsihisieeratwilkesuniversityids0 -from sch.implnet_sch_cuahsihisieeratwilkesuniversityids0 import implnet_sch_cuahsihisieeratwilkesuniversityids0 -from jobs.implnet_jobs_cuahsihismudlakeids0 import implnet_job_cuahsihismudlakeids0 -from sch.implnet_sch_cuahsihismudlakeids0 import implnet_sch_cuahsihismudlakeids0 -from jobs.implnet_jobs_cuahsihismwdisids0 import implnet_job_cuahsihismwdisids0 -from sch.implnet_sch_cuahsihismwdisids0 import implnet_sch_cuahsihismwdisids0 -from jobs.implnet_jobs_cuahsihisloganriverids0 import implnet_job_cuahsihisloganriverids0 -from sch.implnet_sch_cuahsihisloganriverids0 import implnet_sch_cuahsihisloganriverids0 -from jobs.implnet_jobs_cuahsihisscanids0 import implnet_job_cuahsihisscanids0 -from sch.implnet_sch_cuahsihisscanids0 import 
implnet_sch_cuahsihisscanids0 -from jobs.implnet_jobs_cuahsihisnashrwaids0 import implnet_job_cuahsihisnashrwaids0 -from sch.implnet_sch_cuahsihisnashrwaids0 import implnet_sch_cuahsihisnashrwaids0 -from jobs.implnet_jobs_cuahsihismobilecrowdhydrologyids0 import implnet_job_cuahsihismobilecrowdhydrologyids0 -from sch.implnet_sch_cuahsihismobilecrowdhydrologyids0 import implnet_sch_cuahsihismobilecrowdhydrologyids0 -from jobs.implnet_jobs_cuahsihisandrewsforestlterids0 import implnet_job_cuahsihisandrewsforestlterids0 -from sch.implnet_sch_cuahsihisandrewsforestlterids0 import implnet_sch_cuahsihisandrewsforestlterids0 -from jobs.implnet_jobs_cuahsihisloganrivergamutids0 import implnet_job_cuahsihisloganrivergamutids0 -from sch.implnet_sch_cuahsihisloganrivergamutids0 import implnet_sch_cuahsihisloganrivergamutids0 -from jobs.implnet_jobs_cuahsihislittlebearriverids0 import implnet_job_cuahsihislittlebearriverids0 -from sch.implnet_sch_cuahsihislittlebearriverids0 import implnet_sch_cuahsihislittlebearriverids0 -from jobs.implnet_jobs_cuahsihislterntlwoodruffids0 import implnet_job_cuahsihislterntlwoodruffids0 -from sch.implnet_sch_cuahsihislterntlwoodruffids0 import implnet_sch_cuahsihislterntlwoodruffids0 -from jobs.implnet_jobs_cuahsihissagehencreekids0 import implnet_job_cuahsihissagehencreekids0 -from sch.implnet_sch_cuahsihissagehencreekids0 import implnet_sch_cuahsihissagehencreekids0 -from jobs.implnet_jobs_cuahsihisshalenetworkodmids1 import implnet_job_cuahsihisshalenetworkodmids1 -from sch.implnet_sch_cuahsihisshalenetworkodmids1 import implnet_sch_cuahsihisshalenetworkodmids1 -from jobs.implnet_jobs_cuahsihisfrcwqmids0 import implnet_job_cuahsihisfrcwqmids0 -from sch.implnet_sch_cuahsihisfrcwqmids0 import implnet_sch_cuahsihisfrcwqmids0 -from jobs.implnet_jobs_cuahsihishydrodataczdids0 import implnet_job_cuahsihishydrodataczdids0 -from sch.implnet_sch_cuahsihishydrodataczdids0 import implnet_sch_cuahsihishydrodataczdids0 -from 
jobs.implnet_jobs_cuahsihisdrwiids0 import implnet_job_cuahsihisdrwiids0 -from sch.implnet_sch_cuahsihisdrwiids0 import implnet_sch_cuahsihisdrwiids0 -from jobs.implnet_jobs_cuahsihisubwpadids0 import implnet_job_cuahsihisubwpadids0 -from sch.implnet_sch_cuahsihisubwpadids0 import implnet_sch_cuahsihisubwpadids0 -from jobs.implnet_jobs_cuahsihistrwaids0 import implnet_job_cuahsihistrwaids0 -from sch.implnet_sch_cuahsihistrwaids0 import implnet_sch_cuahsihistrwaids0 -from jobs.implnet_jobs_cuahsihisredbuttecreekgamutids0 import implnet_job_cuahsihisredbuttecreekgamutids0 -from sch.implnet_sch_cuahsihisredbuttecreekgamutids0 import implnet_sch_cuahsihisredbuttecreekgamutids0 -from jobs.implnet_jobs_cuahsihisglacialridgeids0 import implnet_job_cuahsihisglacialridgeids0 -from sch.implnet_sch_cuahsihisglacialridgeids0 import implnet_sch_cuahsihisglacialridgeids0 -from jobs.implnet_jobs_cuahsihisfcelterids0 import implnet_job_cuahsihisfcelterids0 -from sch.implnet_sch_cuahsihisfcelterids0 import implnet_sch_cuahsihisfcelterids0 -from jobs.implnet_jobs_cuahsihisczoarizids0 import implnet_job_cuahsihisczoarizids0 -from sch.implnet_sch_cuahsihisczoarizids0 import implnet_sch_cuahsihisczoarizids0 -from jobs.implnet_jobs_cuahsihiscalvinhhsids0 import implnet_job_cuahsihiscalvinhhsids0 -from sch.implnet_sch_cuahsihiscalvinhhsids0 import implnet_sch_cuahsihiscalvinhhsids0 -from jobs.implnet_jobs_cuahsihissnotelids0 import implnet_job_cuahsihissnotelids0 -from sch.implnet_sch_cuahsihissnotelids0 import implnet_sch_cuahsihissnotelids0 -from jobs.implnet_jobs_cuahsihisnevcanids0 import implnet_job_cuahsihisnevcanids0 -from sch.implnet_sch_cuahsihisnevcanids0 import implnet_sch_cuahsihisnevcanids0 -from jobs.implnet_jobs_cuahsihisczopsuids0 import implnet_job_cuahsihisczopsuids0 -from sch.implnet_sch_cuahsihisczopsuids0 import implnet_sch_cuahsihisczopsuids0 -from jobs.implnet_jobs_cuahsihisbrazilucbids0 import implnet_job_cuahsihisbrazilucbids0 -from 
sch.implnet_sch_cuahsihisbrazilucbids0 import implnet_sch_cuahsihisbrazilucbids0 -from jobs.implnet_jobs_cuahsihisgleonauburnids0 import implnet_job_cuahsihisgleonauburnids0 -from sch.implnet_sch_cuahsihisgleonauburnids0 import implnet_sch_cuahsihisgleonauburnids0 -from jobs.implnet_jobs_cuahsihislaselvastreamdischargeids0 import implnet_job_cuahsihislaselvastreamdischargeids0 -from sch.implnet_sch_cuahsihislaselvastreamdischargeids0 import implnet_sch_cuahsihislaselvastreamdischargeids0 -from jobs.implnet_jobs_cuahsihisisbenaids0 import implnet_job_cuahsihisisbenaids0 -from sch.implnet_sch_cuahsihisisbenaids0 import implnet_sch_cuahsihisisbenaids0 -from jobs.implnet_jobs_cuahsihisswedishmonitoringdataids0 import implnet_job_cuahsihisswedishmonitoringdataids0 -from sch.implnet_sch_cuahsihisswedishmonitoringdataids0 import implnet_sch_cuahsihisswedishmonitoringdataids0 -from jobs.implnet_jobs_cuahsihisunhsnowids0 import implnet_job_cuahsihisunhsnowids0 -from sch.implnet_sch_cuahsihisunhsnowids0 import implnet_sch_cuahsihisunhsnowids0 -from jobs.implnet_jobs_cuahsihishassbergeids0 import implnet_job_cuahsihishassbergeids0 -from sch.implnet_sch_cuahsihishassbergeids0 import implnet_sch_cuahsihishassbergeids0 -from jobs.implnet_jobs_cuahsihisnhgswofids0 import implnet_job_cuahsihisnhgswofids0 -from sch.implnet_sch_cuahsihisnhgswofids0 import implnet_sch_cuahsihisnhgswofids0 -from jobs.implnet_jobs_cuahsihisgonggaids0 import implnet_job_cuahsihisgonggaids0 -from sch.implnet_sch_cuahsihisgonggaids0 import implnet_sch_cuahsihisgonggaids0 -from jobs.implnet_jobs_cuahsihismopexids0 import implnet_job_cuahsihismopexids0 -from sch.implnet_sch_cuahsihismopexids0 import implnet_sch_cuahsihismopexids0 -from jobs.implnet_jobs_cagagespids0 import implnet_job_cagagespids0 -from sch.implnet_sch_cagagespids0 import implnet_sch_cagagespids0 -from jobs.implnet_jobs_sechydrgreg0 import implnet_job_sechydrgreg0 -from sch.implnet_sch_sechydrgreg0 import implnet_sch_sechydrgreg0 -from 
jobs.implnet_jobs_counties0 import implnet_job_counties0 -from sch.implnet_sch_counties0 import implnet_sch_counties0 -from jobs.implnet_jobs_pws0 import implnet_job_pws0 -from sch.implnet_sch_pws0 import implnet_sch_pws0 -from jobs.implnet_jobs_hu060 import implnet_job_hu060 -from sch.implnet_sch_hu060 import implnet_sch_hu060 -from jobs.implnet_jobs_nataq0 import implnet_job_nataq0 -from sch.implnet_sch_nataq0 import implnet_sch_nataq0 -from jobs.implnet_jobs_cbsa0 import implnet_job_cbsa0 -from sch.implnet_sch_cbsa0 import implnet_sch_cbsa0 -from jobs.implnet_jobs_hu080 import implnet_job_hu080 -from sch.implnet_sch_hu080 import implnet_sch_hu080 -from jobs.implnet_jobs_hu040 import implnet_job_hu040 -from sch.implnet_sch_hu040 import implnet_sch_hu040 -from jobs.implnet_jobs_princiaq0 import implnet_job_princiaq0 -from sch.implnet_sch_princiaq0 import implnet_sch_princiaq0 -from jobs.implnet_jobs_refgage0 import implnet_job_refgage0 -from sch.implnet_sch_refgage0 import implnet_sch_refgage0 -from jobs.implnet_jobs_refgage3 import implnet_job_refgage3 -from sch.implnet_sch_refgage3 import implnet_sch_refgage3 -from jobs.implnet_jobs_refgage2 import implnet_job_refgage2 -from sch.implnet_sch_refgage2 import implnet_sch_refgage2 -from jobs.implnet_jobs_refgage1 import implnet_job_refgage1 -from sch.implnet_sch_refgage1 import implnet_sch_refgage1 -from jobs.implnet_jobs_dams0 import implnet_job_dams0 -from sch.implnet_sch_dams0 import implnet_sch_dams0 -from jobs.implnet_jobs_dams1 import implnet_job_dams1 -from sch.implnet_sch_dams1 import implnet_sch_dams1 -from jobs.implnet_jobs_ua100 import implnet_job_ua100 -from sch.implnet_sch_ua100 import implnet_sch_ua100 -from jobs.implnet_jobs_states0 import implnet_job_states0 -from sch.implnet_sch_states0 import implnet_sch_states0 -from jobs.implnet_jobs_hu100 import implnet_job_hu100 -from sch.implnet_sch_hu100 import implnet_sch_hu100 -from jobs.implnet_jobs_aiannh0 import implnet_job_aiannh0 -from 
sch.implnet_sch_aiannh0 import implnet_sch_aiannh0 -from jobs.implnet_jobs_hu020 import implnet_job_hu020 -from sch.implnet_sch_hu020 import implnet_sch_hu020 -from jobs.implnet_jobs_mainstems0 import implnet_job_mainstems0 -from sch.implnet_sch_mainstems0 import implnet_sch_mainstems0 -from jobs.implnet_jobs_places0 import implnet_job_places0 -from sch.implnet_sch_places0 import implnet_sch_places0 -from jobs.implnet_jobs_hmw0 import implnet_job_hmw0 -from sch.implnet_sch_hmw0 import implnet_sch_hmw0 -from jobs.implnet_jobs_hmw1 import implnet_job_hmw1 -from sch.implnet_sch_hmw1 import implnet_sch_hmw1 -from jobs.implnet_jobs_huc12pp0 import implnet_job_huc12pp0 -from sch.implnet_sch_huc12pp0 import implnet_sch_huc12pp0 -from jobs.implnet_jobs_huc12pp1 import implnet_job_huc12pp1 -from sch.implnet_sch_huc12pp1 import implnet_sch_huc12pp1 -from jobs.implnet_jobs_nmwdiose3 import implnet_job_nmwdiose3 -from sch.implnet_sch_nmwdiose3 import implnet_sch_nmwdiose3 -from jobs.implnet_jobs_nmwdiose2 import implnet_job_nmwdiose2 -from sch.implnet_sch_nmwdiose2 import implnet_sch_nmwdiose2 -from jobs.implnet_jobs_nmwdiose0 import implnet_job_nmwdiose0 -from sch.implnet_sch_nmwdiose0 import implnet_sch_nmwdiose0 -from jobs.implnet_jobs_nmwdiose4 import implnet_job_nmwdiose4 -from sch.implnet_sch_nmwdiose4 import implnet_sch_nmwdiose4 -from jobs.implnet_jobs_nmwdiose1 import implnet_job_nmwdiose1 -from sch.implnet_sch_nmwdiose1 import implnet_sch_nmwdiose1 -from jobs.implnet_jobs_nmwdist0 import implnet_job_nmwdist0 -from sch.implnet_sch_nmwdist0 import implnet_sch_nmwdist0 -from jobs.implnet_jobs_selfieids0 import implnet_job_selfieids0 -from sch.implnet_sch_selfieids0 import implnet_sch_selfieids0 -from jobs.implnet_jobs_chyldpilotids0 import implnet_job_chyldpilotids0 -from sch.implnet_sch_chyldpilotids0 import implnet_sch_chyldpilotids0 -from jobs.implnet_jobs_rise0 import implnet_job_rise0 -from sch.implnet_sch_rise0 import implnet_sch_rise0 -from 
jobs.implnet_jobs_autotest10 import implnet_job_autotest10 -from sch.implnet_sch_autotest10 import implnet_sch_autotest10 -from jobs.implnet_jobs_links0 import implnet_job_links0 -from sch.implnet_sch_links0 import implnet_sch_links0 -from jobs.implnet_jobs_demo0 import implnet_job_demo0 -from sch.implnet_sch_demo0 import implnet_sch_demo0 -from jobs.implnet_jobs_autotest20 import implnet_job_autotest20 -from sch.implnet_sch_autotest20 import implnet_sch_autotest20 -from jobs.implnet_jobs_wade2 import implnet_job_wade2 -from sch.implnet_sch_wade2 import implnet_sch_wade2 -from jobs.implnet_jobs_wade0 import implnet_job_wade0 -from sch.implnet_sch_wade0 import implnet_sch_wade0 -from jobs.implnet_jobs_wade17 import implnet_job_wade17 -from sch.implnet_sch_wade17 import implnet_sch_wade17 -from jobs.implnet_jobs_wade9 import implnet_job_wade9 -from sch.implnet_sch_wade9 import implnet_sch_wade9 -from jobs.implnet_jobs_wade7 import implnet_job_wade7 -from sch.implnet_sch_wade7 import implnet_sch_wade7 -from jobs.implnet_jobs_wade3 import implnet_job_wade3 -from sch.implnet_sch_wade3 import implnet_sch_wade3 -from jobs.implnet_jobs_wade15 import implnet_job_wade15 -from sch.implnet_sch_wade15 import implnet_sch_wade15 -from jobs.implnet_jobs_wade5 import implnet_job_wade5 -from sch.implnet_sch_wade5 import implnet_sch_wade5 -from jobs.implnet_jobs_wade10 import implnet_job_wade10 -from sch.implnet_sch_wade10 import implnet_sch_wade10 -from jobs.implnet_jobs_wade14 import implnet_job_wade14 -from sch.implnet_sch_wade14 import implnet_sch_wade14 -from jobs.implnet_jobs_wade18 import implnet_job_wade18 -from sch.implnet_sch_wade18 import implnet_sch_wade18 -from jobs.implnet_jobs_wade13 import implnet_job_wade13 -from sch.implnet_sch_wade13 import implnet_sch_wade13 -from jobs.implnet_jobs_wade8 import implnet_job_wade8 -from sch.implnet_sch_wade8 import implnet_sch_wade8 -from jobs.implnet_jobs_wade19 import implnet_job_wade19 -from sch.implnet_sch_wade19 import 
implnet_sch_wade19 -from jobs.implnet_jobs_wade12 import implnet_job_wade12 -from sch.implnet_sch_wade12 import implnet_sch_wade12 -from jobs.implnet_jobs_wade4 import implnet_job_wade4 -from sch.implnet_sch_wade4 import implnet_sch_wade4 -from jobs.implnet_jobs_wade16 import implnet_job_wade16 -from sch.implnet_sch_wade16 import implnet_sch_wade16 -from jobs.implnet_jobs_wade1 import implnet_job_wade1 -from sch.implnet_sch_wade1 import implnet_sch_wade1 -from jobs.implnet_jobs_wade6 import implnet_job_wade6 -from sch.implnet_sch_wade6 import implnet_sch_wade6 -from jobs.implnet_jobs_wade11 import implnet_job_wade11 -from sch.implnet_sch_wade11 import implnet_sch_wade11 - -@repository -def gleaner(): - jobs = [implnet_job_nwisgw20, implnet_job_nwisgw22, implnet_job_nwisgw16, implnet_job_nwisgw12, implnet_job_nwisgw25, implnet_job_nwisgw14, implnet_job_nwisgw23, implnet_job_nwisgw10, implnet_job_nwisgw15, implnet_job_nwisgw2, implnet_job_nwisgw24, implnet_job_nwisgw9, implnet_job_nwisgw19, implnet_job_nwisgw28, implnet_job_nwisgw26, implnet_job_nwisgw5, implnet_job_nwisgw13, implnet_job_nwisgw6, implnet_job_nwisgw3, implnet_job_nwisgw4, implnet_job_nwisgw1, implnet_job_nwisgw21, implnet_job_nwisgw27, implnet_job_nwisgw8, implnet_job_nwisgw17, implnet_job_nwisgw18, implnet_job_nwisgw7, implnet_job_nwisgw11, implnet_job_nwisgw0, implnet_job_nwissite1, implnet_job_nwissite3, implnet_job_nwissite0, implnet_job_nwissite2, implnet_job_gfv11pois1, implnet_job_gfv11pois0, implnet_job_hydrologicunit0, implnet_job_damspids0, implnet_job_cuahsihishydrodataczhrids0, implnet_job_cuahsihisnooksackmicroclimatenetworkids0, implnet_job_cuahsihisneonids0, implnet_job_cuahsihisglobalriversobservatoryids0, implnet_job_cuahsihistncwaterdataids0, implnet_job_cuahsihisscotlandnwisids0, implnet_job_cuahsihisczoboulderids0, implnet_job_cuahsihisyosemitehydroclimatenetworkids0, implnet_job_cuahsihismuddyriverids0, implnet_job_cuahsihisczomercedids0, implnet_job_cuahsihisghcnids0, 
implnet_job_cuahsihismmaatacamaids0, implnet_job_cuahsihisumbcwqids0, implnet_job_cuahsihisgleonlakeannieids0, implnet_job_cuahsihisluwlids0, implnet_job_cuahsihiscedarriverids0, implnet_job_cuahsihisccbepdapids0, implnet_job_cuahsihiskansasweatherdataids0, implnet_job_cuahsihisodmkentstateids0, implnet_job_cuahsihisgleondorsetids0, implnet_job_cuahsihisclarksburgspids0, implnet_job_cuahsihiscrwaids0, implnet_job_cuahsihiscuisoids0, implnet_job_cuahsihisprovorivergamutids0, implnet_job_cuahsihisirwaids0, implnet_job_cuahsihisczoluquilloids0, implnet_job_cuahsihistuolumnemdwids0, implnet_job_cuahsihisrmblids0, implnet_job_cuahsihispanolaodmids0, implnet_job_cuahsihisnewnids0, implnet_job_cuahsihisczoudelids0, implnet_job_cuahsihisfarmrwaids0, implnet_job_cuahsihisskcmilltownids0, implnet_job_cuahsihisumbcgwids0, implnet_job_cuahsihisshalenetworkodmids0, implnet_job_cuahsihisnevadosids0, implnet_job_cuahsihisweiherbachids0, implnet_job_cuahsihismazarriverprojectids0, implnet_job_cuahsihisgleonsunapeeids0, implnet_job_cuahsihisorsancohabids0, implnet_job_cuahsihismwraids0, implnet_job_cuahsihismaaeriids0, implnet_job_cuahsihisnceiww2ids0, implnet_job_cuahsihistarlandwaterqualityids0, implnet_job_cuahsihislczoodm2ids0, implnet_job_cuahsihiscocorahsids0, implnet_job_cuahsihisparalanaturalezaids0, implnet_job_cuahsihisczocatalinaids0, implnet_job_cuahsihisieeratwilkesuniversityids0, implnet_job_cuahsihismudlakeids0, implnet_job_cuahsihismwdisids0, implnet_job_cuahsihisloganriverids0, implnet_job_cuahsihisscanids0, implnet_job_cuahsihisnashrwaids0, implnet_job_cuahsihismobilecrowdhydrologyids0, implnet_job_cuahsihisandrewsforestlterids0, implnet_job_cuahsihisloganrivergamutids0, implnet_job_cuahsihislittlebearriverids0, implnet_job_cuahsihislterntlwoodruffids0, implnet_job_cuahsihissagehencreekids0, implnet_job_cuahsihisshalenetworkodmids1, implnet_job_cuahsihisfrcwqmids0, implnet_job_cuahsihishydrodataczdids0, implnet_job_cuahsihisdrwiids0, 
implnet_job_cuahsihisubwpadids0, implnet_job_cuahsihistrwaids0, implnet_job_cuahsihisredbuttecreekgamutids0, implnet_job_cuahsihisglacialridgeids0, implnet_job_cuahsihisfcelterids0, implnet_job_cuahsihisczoarizids0, implnet_job_cuahsihiscalvinhhsids0, implnet_job_cuahsihissnotelids0, implnet_job_cuahsihisnevcanids0, implnet_job_cuahsihisczopsuids0, implnet_job_cuahsihisbrazilucbids0, implnet_job_cuahsihisgleonauburnids0, implnet_job_cuahsihislaselvastreamdischargeids0, implnet_job_cuahsihisisbenaids0, implnet_job_cuahsihisswedishmonitoringdataids0, implnet_job_cuahsihisunhsnowids0, implnet_job_cuahsihishassbergeids0, implnet_job_cuahsihisnhgswofids0, implnet_job_cuahsihisgonggaids0, implnet_job_cuahsihismopexids0, implnet_job_cagagespids0, implnet_job_sechydrgreg0, implnet_job_counties0, implnet_job_pws0, implnet_job_hu060, implnet_job_nataq0, implnet_job_cbsa0, implnet_job_hu080, implnet_job_hu040, implnet_job_princiaq0, implnet_job_refgage0, implnet_job_refgage3, implnet_job_refgage2, implnet_job_refgage1, implnet_job_dams0, implnet_job_dams1, implnet_job_ua100, implnet_job_states0, implnet_job_hu100, implnet_job_aiannh0, implnet_job_hu020, implnet_job_mainstems0, implnet_job_places0, implnet_job_hmw0, implnet_job_hmw1, implnet_job_huc12pp0, implnet_job_huc12pp1, implnet_job_nmwdiose3, implnet_job_nmwdiose2, implnet_job_nmwdiose0, implnet_job_nmwdiose4, implnet_job_nmwdiose1, implnet_job_nmwdist0, implnet_job_selfieids0, implnet_job_chyldpilotids0, implnet_job_rise0, implnet_job_autotest10, implnet_job_links0, implnet_job_demo0, implnet_job_autotest20, implnet_job_wade2, implnet_job_wade0, implnet_job_wade17, implnet_job_wade9, implnet_job_wade7, implnet_job_wade3, implnet_job_wade15, implnet_job_wade5, implnet_job_wade10, implnet_job_wade14, implnet_job_wade18, implnet_job_wade13, implnet_job_wade8, implnet_job_wade19, implnet_job_wade12, implnet_job_wade4, implnet_job_wade16, implnet_job_wade1, implnet_job_wade6, implnet_job_wade11] - schedules = 
[implnet_sch_nwisgw20, implnet_sch_nwisgw22, implnet_sch_nwisgw16, implnet_sch_nwisgw12, implnet_sch_nwisgw25, implnet_sch_nwisgw14, implnet_sch_nwisgw23, implnet_sch_nwisgw10, implnet_sch_nwisgw15, implnet_sch_nwisgw2, implnet_sch_nwisgw24, implnet_sch_nwisgw9, implnet_sch_nwisgw19, implnet_sch_nwisgw28, implnet_sch_nwisgw26, implnet_sch_nwisgw5, implnet_sch_nwisgw13, implnet_sch_nwisgw6, implnet_sch_nwisgw3, implnet_sch_nwisgw4, implnet_sch_nwisgw1, implnet_sch_nwisgw21, implnet_sch_nwisgw27, implnet_sch_nwisgw8, implnet_sch_nwisgw17, implnet_sch_nwisgw18, implnet_sch_nwisgw7, implnet_sch_nwisgw11, implnet_sch_nwisgw0, implnet_sch_nwissite1, implnet_sch_nwissite3, implnet_sch_nwissite0, implnet_sch_nwissite2, implnet_sch_gfv11pois1, implnet_sch_gfv11pois0, implnet_sch_hydrologicunit0, implnet_sch_damspids0, implnet_sch_cuahsihishydrodataczhrids0, implnet_sch_cuahsihisnooksackmicroclimatenetworkids0, implnet_sch_cuahsihisneonids0, implnet_sch_cuahsihisglobalriversobservatoryids0, implnet_sch_cuahsihistncwaterdataids0, implnet_sch_cuahsihisscotlandnwisids0, implnet_sch_cuahsihisczoboulderids0, implnet_sch_cuahsihisyosemitehydroclimatenetworkids0, implnet_sch_cuahsihismuddyriverids0, implnet_sch_cuahsihisczomercedids0, implnet_sch_cuahsihisghcnids0, implnet_sch_cuahsihismmaatacamaids0, implnet_sch_cuahsihisumbcwqids0, implnet_sch_cuahsihisgleonlakeannieids0, implnet_sch_cuahsihisluwlids0, implnet_sch_cuahsihiscedarriverids0, implnet_sch_cuahsihisccbepdapids0, implnet_sch_cuahsihiskansasweatherdataids0, implnet_sch_cuahsihisodmkentstateids0, implnet_sch_cuahsihisgleondorsetids0, implnet_sch_cuahsihisclarksburgspids0, implnet_sch_cuahsihiscrwaids0, implnet_sch_cuahsihiscuisoids0, implnet_sch_cuahsihisprovorivergamutids0, implnet_sch_cuahsihisirwaids0, implnet_sch_cuahsihisczoluquilloids0, implnet_sch_cuahsihistuolumnemdwids0, implnet_sch_cuahsihisrmblids0, implnet_sch_cuahsihispanolaodmids0, implnet_sch_cuahsihisnewnids0, implnet_sch_cuahsihisczoudelids0, 
implnet_sch_cuahsihisfarmrwaids0, implnet_sch_cuahsihisskcmilltownids0, implnet_sch_cuahsihisumbcgwids0, implnet_sch_cuahsihisshalenetworkodmids0, implnet_sch_cuahsihisnevadosids0, implnet_sch_cuahsihisweiherbachids0, implnet_sch_cuahsihismazarriverprojectids0, implnet_sch_cuahsihisgleonsunapeeids0, implnet_sch_cuahsihisorsancohabids0, implnet_sch_cuahsihismwraids0, implnet_sch_cuahsihismaaeriids0, implnet_sch_cuahsihisnceiww2ids0, implnet_sch_cuahsihistarlandwaterqualityids0, implnet_sch_cuahsihislczoodm2ids0, implnet_sch_cuahsihiscocorahsids0, implnet_sch_cuahsihisparalanaturalezaids0, implnet_sch_cuahsihisczocatalinaids0, implnet_sch_cuahsihisieeratwilkesuniversityids0, implnet_sch_cuahsihismudlakeids0, implnet_sch_cuahsihismwdisids0, implnet_sch_cuahsihisloganriverids0, implnet_sch_cuahsihisscanids0, implnet_sch_cuahsihisnashrwaids0, implnet_sch_cuahsihismobilecrowdhydrologyids0, implnet_sch_cuahsihisandrewsforestlterids0, implnet_sch_cuahsihisloganrivergamutids0, implnet_sch_cuahsihislittlebearriverids0, implnet_sch_cuahsihislterntlwoodruffids0, implnet_sch_cuahsihissagehencreekids0, implnet_sch_cuahsihisshalenetworkodmids1, implnet_sch_cuahsihisfrcwqmids0, implnet_sch_cuahsihishydrodataczdids0, implnet_sch_cuahsihisdrwiids0, implnet_sch_cuahsihisubwpadids0, implnet_sch_cuahsihistrwaids0, implnet_sch_cuahsihisredbuttecreekgamutids0, implnet_sch_cuahsihisglacialridgeids0, implnet_sch_cuahsihisfcelterids0, implnet_sch_cuahsihisczoarizids0, implnet_sch_cuahsihiscalvinhhsids0, implnet_sch_cuahsihissnotelids0, implnet_sch_cuahsihisnevcanids0, implnet_sch_cuahsihisczopsuids0, implnet_sch_cuahsihisbrazilucbids0, implnet_sch_cuahsihisgleonauburnids0, implnet_sch_cuahsihislaselvastreamdischargeids0, implnet_sch_cuahsihisisbenaids0, implnet_sch_cuahsihisswedishmonitoringdataids0, implnet_sch_cuahsihisunhsnowids0, implnet_sch_cuahsihishassbergeids0, implnet_sch_cuahsihisnhgswofids0, implnet_sch_cuahsihisgonggaids0, implnet_sch_cuahsihismopexids0, 
implnet_sch_cagagespids0, implnet_sch_sechydrgreg0, implnet_sch_counties0, implnet_sch_pws0, implnet_sch_hu060, implnet_sch_nataq0, implnet_sch_cbsa0, implnet_sch_hu080, implnet_sch_hu040, implnet_sch_princiaq0, implnet_sch_refgage0, implnet_sch_refgage3, implnet_sch_refgage2, implnet_sch_refgage1, implnet_sch_dams0, implnet_sch_dams1, implnet_sch_ua100, implnet_sch_states0, implnet_sch_hu100, implnet_sch_aiannh0, implnet_sch_hu020, implnet_sch_mainstems0, implnet_sch_places0, implnet_sch_hmw0, implnet_sch_hmw1, implnet_sch_huc12pp0, implnet_sch_huc12pp1, implnet_sch_nmwdiose3, implnet_sch_nmwdiose2, implnet_sch_nmwdiose0, implnet_sch_nmwdiose4, implnet_sch_nmwdiose1, implnet_sch_nmwdist0, implnet_sch_selfieids0, implnet_sch_chyldpilotids0, implnet_sch_rise0, implnet_sch_autotest10, implnet_sch_links0, implnet_sch_demo0, implnet_sch_autotest20, implnet_sch_wade2, implnet_sch_wade0, implnet_sch_wade17, implnet_sch_wade9, implnet_sch_wade7, implnet_sch_wade3, implnet_sch_wade15, implnet_sch_wade5, implnet_sch_wade10, implnet_sch_wade14, implnet_sch_wade18, implnet_sch_wade13, implnet_sch_wade8, implnet_sch_wade19, implnet_sch_wade12, implnet_sch_wade4, implnet_sch_wade16, implnet_sch_wade1, implnet_sch_wade6, implnet_sch_wade11] - - - return jobs + schedules diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_aiannh0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_aiannh0.py deleted file mode 100644 index 015154da..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_aiannh0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_aiannh0 import implnet_job_aiannh0 - -@schedule(cron_schedule="0 8 24 * *", job=implnet_job_aiannh0, execution_timezone="US/Central") -def implnet_sch_aiannh0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_autotest10.py 
b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_autotest10.py deleted file mode 100644 index 8a954b82..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_autotest10.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_autotest10 import implnet_job_autotest10 - -@schedule(cron_schedule="0 4 27 * *", job=implnet_job_autotest10, execution_timezone="US/Central") -def implnet_sch_autotest10(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_autotest20.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_autotest20.py deleted file mode 100644 index 9a29f36c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_autotest20.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_autotest20 import implnet_job_autotest20 - -@schedule(cron_schedule="0 16 27 * *", job=implnet_job_autotest20, execution_timezone="US/Central") -def implnet_sch_autotest20(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cagagespids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cagagespids0.py deleted file mode 100644 index e4db94d9..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cagagespids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cagagespids0 import implnet_job_cagagespids0 - -@schedule(cron_schedule="0 4 21 * *", job=implnet_job_cagagespids0, execution_timezone="US/Central") -def implnet_sch_cagagespids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cbsa0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cbsa0.py deleted file mode 100644 index eda1610a..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cbsa0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cbsa0 import implnet_job_cbsa0 - -@schedule(cron_schedule="0 4 22 * *", job=implnet_job_cbsa0, execution_timezone="US/Central") -def implnet_sch_cbsa0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_chyldpilotids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_chyldpilotids0.py deleted file mode 100644 index d27a7587..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_chyldpilotids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_chyldpilotids0 import implnet_job_chyldpilotids0 - -@schedule(cron_schedule="0 20 26 * *", job=implnet_job_chyldpilotids0, execution_timezone="US/Central") -def implnet_sch_chyldpilotids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_counties0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_counties0.py deleted file mode 100644 index 5c4844e4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_counties0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_counties0 import implnet_job_counties0 - -@schedule(cron_schedule="0 12 21 * *", job=implnet_job_counties0, execution_timezone="US/Central") -def implnet_sch_counties0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisandrewsforestlterids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisandrewsforestlterids0.py deleted file mode 100644 index 5e0b22a2..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisandrewsforestlterids0.py +++ 
/dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisandrewsforestlterids0 import implnet_job_cuahsihisandrewsforestlterids0 - -@schedule(cron_schedule="0 8 16 * *", job=implnet_job_cuahsihisandrewsforestlterids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisandrewsforestlterids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisbrazilucbids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisbrazilucbids0.py deleted file mode 100644 index 3456f16b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisbrazilucbids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisbrazilucbids0 import implnet_job_cuahsihisbrazilucbids0 - -@schedule(cron_schedule="0 12 19 * *", job=implnet_job_cuahsihisbrazilucbids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisbrazilucbids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscalvinhhsids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscalvinhhsids0.py deleted file mode 100644 index 20ae9051..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscalvinhhsids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihiscalvinhhsids0 import implnet_job_cuahsihiscalvinhhsids0 - -@schedule(cron_schedule="0 20 18 * *", job=implnet_job_cuahsihiscalvinhhsids0, execution_timezone="US/Central") -def implnet_sch_cuahsihiscalvinhhsids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisccbepdapids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisccbepdapids0.py deleted file 
mode 100644 index 8e46588a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisccbepdapids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisccbepdapids0 import implnet_job_cuahsihisccbepdapids0 - -@schedule(cron_schedule="0 20 9 * *", job=implnet_job_cuahsihisccbepdapids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisccbepdapids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscedarriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscedarriverids0.py deleted file mode 100644 index 36f484f4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscedarriverids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihiscedarriverids0 import implnet_job_cuahsihiscedarriverids0 - -@schedule(cron_schedule="0 16 9 * *", job=implnet_job_cuahsihiscedarriverids0, execution_timezone="US/Central") -def implnet_sch_cuahsihiscedarriverids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisclarksburgspids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisclarksburgspids0.py deleted file mode 100644 index f5ab8105..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisclarksburgspids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisclarksburgspids0 import implnet_job_cuahsihisclarksburgspids0 - -@schedule(cron_schedule="0 12 10 * *", job=implnet_job_cuahsihisclarksburgspids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisclarksburgspids0(_context): - run_config = {} - return run_config diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscocorahsids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscocorahsids0.py deleted file mode 100644 index 91519c57..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscocorahsids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihiscocorahsids0 import implnet_job_cuahsihiscocorahsids0 - -@schedule(cron_schedule="0 16 14 * *", job=implnet_job_cuahsihiscocorahsids0, execution_timezone="US/Central") -def implnet_sch_cuahsihiscocorahsids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscrwaids0.py deleted file mode 100644 index ea10a440..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscrwaids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihiscrwaids0 import implnet_job_cuahsihiscrwaids0 - -@schedule(cron_schedule="0 16 10 * *", job=implnet_job_cuahsihiscrwaids0, execution_timezone="US/Central") -def implnet_sch_cuahsihiscrwaids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscuisoids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscuisoids0.py deleted file mode 100644 index ece86067..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiscuisoids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihiscuisoids0 import implnet_job_cuahsihiscuisoids0 - -@schedule(cron_schedule="0 20 10 * *", job=implnet_job_cuahsihiscuisoids0, execution_timezone="US/Central") -def implnet_sch_cuahsihiscuisoids0(_context): - run_config 
= {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoarizids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoarizids0.py deleted file mode 100644 index bae4aa6e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoarizids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisczoarizids0 import implnet_job_cuahsihisczoarizids0 - -@schedule(cron_schedule="0 16 18 * *", job=implnet_job_cuahsihisczoarizids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisczoarizids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoboulderids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoboulderids0.py deleted file mode 100644 index d9e166b1..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoboulderids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisczoboulderids0 import implnet_job_cuahsihisczoboulderids0 - -@schedule(cron_schedule="0 4 8 * *", job=implnet_job_cuahsihisczoboulderids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisczoboulderids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczocatalinaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczocatalinaids0.py deleted file mode 100644 index 25e2ff35..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczocatalinaids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisczocatalinaids0 import implnet_job_cuahsihisczocatalinaids0 - -@schedule(cron_schedule="0 0 15 * *", 
job=implnet_job_cuahsihisczocatalinaids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisczocatalinaids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoluquilloids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoluquilloids0.py deleted file mode 100644 index ee0b5b44..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoluquilloids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisczoluquilloids0 import implnet_job_cuahsihisczoluquilloids0 - -@schedule(cron_schedule="0 8 11 * *", job=implnet_job_cuahsihisczoluquilloids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisczoluquilloids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczomercedids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczomercedids0.py deleted file mode 100644 index ae338e42..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczomercedids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisczomercedids0 import implnet_job_cuahsihisczomercedids0 - -@schedule(cron_schedule="0 16 8 * *", job=implnet_job_cuahsihisczomercedids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisczomercedids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczopsuids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczopsuids0.py deleted file mode 100644 index 23487d21..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczopsuids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from 
jobs.implnet_jobs_cuahsihisczopsuids0 import implnet_job_cuahsihisczopsuids0 - -@schedule(cron_schedule="0 8 19 * *", job=implnet_job_cuahsihisczopsuids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisczopsuids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoudelids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoudelids0.py deleted file mode 100644 index b512c419..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisczoudelids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisczoudelids0 import implnet_job_cuahsihisczoudelids0 - -@schedule(cron_schedule="0 4 12 * *", job=implnet_job_cuahsihisczoudelids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisczoudelids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisdrwiids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisdrwiids0.py deleted file mode 100644 index 008c9c6f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisdrwiids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisdrwiids0 import implnet_job_cuahsihisdrwiids0 - -@schedule(cron_schedule="0 16 17 * *", job=implnet_job_cuahsihisdrwiids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisdrwiids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfarmrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfarmrwaids0.py deleted file mode 100644 index f85b0f22..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfarmrwaids0.py +++ /dev/null @@ -1,8 +0,0 @@ 
-from dagster import schedule - -from jobs.implnet_jobs_cuahsihisfarmrwaids0 import implnet_job_cuahsihisfarmrwaids0 - -@schedule(cron_schedule="0 8 12 * *", job=implnet_job_cuahsihisfarmrwaids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisfarmrwaids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfcelterids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfcelterids0.py deleted file mode 100644 index 9d8b942b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfcelterids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisfcelterids0 import implnet_job_cuahsihisfcelterids0 - -@schedule(cron_schedule="0 12 18 * *", job=implnet_job_cuahsihisfcelterids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisfcelterids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfrcwqmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfrcwqmids0.py deleted file mode 100644 index 78f83176..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisfrcwqmids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisfrcwqmids0 import implnet_job_cuahsihisfrcwqmids0 - -@schedule(cron_schedule="0 8 17 * *", job=implnet_job_cuahsihisfrcwqmids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisfrcwqmids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisghcnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisghcnids0.py deleted file mode 100644 index 3bd64523..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisghcnids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisghcnids0 import implnet_job_cuahsihisghcnids0 - -@schedule(cron_schedule="0 20 8 * *", job=implnet_job_cuahsihisghcnids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisghcnids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisglacialridgeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisglacialridgeids0.py deleted file mode 100644 index b3c54291..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisglacialridgeids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisglacialridgeids0 import implnet_job_cuahsihisglacialridgeids0 - -@schedule(cron_schedule="0 8 18 * *", job=implnet_job_cuahsihisglacialridgeids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisglacialridgeids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonauburnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonauburnids0.py deleted file mode 100644 index e6b3de9c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonauburnids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisgleonauburnids0 import implnet_job_cuahsihisgleonauburnids0 - -@schedule(cron_schedule="0 16 19 * *", job=implnet_job_cuahsihisgleonauburnids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisgleonauburnids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleondorsetids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleondorsetids0.py deleted file mode 100644 index 356c75a3..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleondorsetids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisgleondorsetids0 import implnet_job_cuahsihisgleondorsetids0 - -@schedule(cron_schedule="0 8 10 * *", job=implnet_job_cuahsihisgleondorsetids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisgleondorsetids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonlakeannieids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonlakeannieids0.py deleted file mode 100644 index 045a046b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonlakeannieids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisgleonlakeannieids0 import implnet_job_cuahsihisgleonlakeannieids0 - -@schedule(cron_schedule="0 8 9 * *", job=implnet_job_cuahsihisgleonlakeannieids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisgleonlakeannieids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonsunapeeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonsunapeeids0.py deleted file mode 100644 index 28b1ac9c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgleonsunapeeids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisgleonsunapeeids0 import implnet_job_cuahsihisgleonsunapeeids0 - -@schedule(cron_schedule="0 12 13 * *", job=implnet_job_cuahsihisgleonsunapeeids0, execution_timezone="US/Central") -def 
implnet_sch_cuahsihisgleonsunapeeids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisglobalriversobservatoryids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisglobalriversobservatoryids0.py deleted file mode 100644 index a74d69a5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisglobalriversobservatoryids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisglobalriversobservatoryids0 import implnet_job_cuahsihisglobalriversobservatoryids0 - -@schedule(cron_schedule="0 16 7 * *", job=implnet_job_cuahsihisglobalriversobservatoryids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisglobalriversobservatoryids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgonggaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgonggaids0.py deleted file mode 100644 index d058b0b4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisgonggaids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisgonggaids0 import implnet_job_cuahsihisgonggaids0 - -@schedule(cron_schedule="0 20 20 * *", job=implnet_job_cuahsihisgonggaids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisgonggaids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishassbergeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishassbergeids0.py deleted file mode 100644 index fcac8846..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishassbergeids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from 
jobs.implnet_jobs_cuahsihishassbergeids0 import implnet_job_cuahsihishassbergeids0 - -@schedule(cron_schedule="0 12 20 * *", job=implnet_job_cuahsihishassbergeids0, execution_timezone="US/Central") -def implnet_sch_cuahsihishassbergeids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishydrodataczdids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishydrodataczdids0.py deleted file mode 100644 index 2b580b0f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishydrodataczdids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihishydrodataczdids0 import implnet_job_cuahsihishydrodataczdids0 - -@schedule(cron_schedule="0 12 17 * *", job=implnet_job_cuahsihishydrodataczdids0, execution_timezone="US/Central") -def implnet_sch_cuahsihishydrodataczdids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishydrodataczhrids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishydrodataczhrids0.py deleted file mode 100644 index 28bbab3d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihishydrodataczhrids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihishydrodataczhrids0 import implnet_job_cuahsihishydrodataczhrids0 - -@schedule(cron_schedule="0 4 7 * *", job=implnet_job_cuahsihishydrodataczhrids0, execution_timezone="US/Central") -def implnet_sch_cuahsihishydrodataczhrids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisieeratwilkesuniversityids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisieeratwilkesuniversityids0.py deleted file mode 100644 index 
02cbbb56..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisieeratwilkesuniversityids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisieeratwilkesuniversityids0 import implnet_job_cuahsihisieeratwilkesuniversityids0 - -@schedule(cron_schedule="0 4 15 * *", job=implnet_job_cuahsihisieeratwilkesuniversityids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisieeratwilkesuniversityids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisirwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisirwaids0.py deleted file mode 100644 index f74db6cf..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisirwaids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisirwaids0 import implnet_job_cuahsihisirwaids0 - -@schedule(cron_schedule="0 4 11 * *", job=implnet_job_cuahsihisirwaids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisirwaids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisisbenaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisisbenaids0.py deleted file mode 100644 index d09f3907..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisisbenaids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisisbenaids0 import implnet_job_cuahsihisisbenaids0 - -@schedule(cron_schedule="0 0 20 * *", job=implnet_job_cuahsihisisbenaids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisisbenaids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiskansasweatherdataids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiskansasweatherdataids0.py deleted file mode 100644 index b676de2e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihiskansasweatherdataids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihiskansasweatherdataids0 import implnet_job_cuahsihiskansasweatherdataids0 - -@schedule(cron_schedule="0 0 10 * *", job=implnet_job_cuahsihiskansasweatherdataids0, execution_timezone="US/Central") -def implnet_sch_cuahsihiskansasweatherdataids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislaselvastreamdischargeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislaselvastreamdischargeids0.py deleted file mode 100644 index cde52a52..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislaselvastreamdischargeids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihislaselvastreamdischargeids0 import implnet_job_cuahsihislaselvastreamdischargeids0 - -@schedule(cron_schedule="0 20 19 * *", job=implnet_job_cuahsihislaselvastreamdischargeids0, execution_timezone="US/Central") -def implnet_sch_cuahsihislaselvastreamdischargeids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislczoodm2ids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislczoodm2ids0.py deleted file mode 100644 index e30ebcac..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislczoodm2ids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihislczoodm2ids0 import implnet_job_cuahsihislczoodm2ids0 - -@schedule(cron_schedule="0 12 14 * *", 
job=implnet_job_cuahsihislczoodm2ids0, execution_timezone="US/Central") -def implnet_sch_cuahsihislczoodm2ids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislittlebearriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislittlebearriverids0.py deleted file mode 100644 index dc0578ff..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislittlebearriverids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihislittlebearriverids0 import implnet_job_cuahsihislittlebearriverids0 - -@schedule(cron_schedule="0 16 16 * *", job=implnet_job_cuahsihislittlebearriverids0, execution_timezone="US/Central") -def implnet_sch_cuahsihislittlebearriverids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisloganrivergamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisloganrivergamutids0.py deleted file mode 100644 index 0ad075c0..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisloganrivergamutids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisloganrivergamutids0 import implnet_job_cuahsihisloganrivergamutids0 - -@schedule(cron_schedule="0 12 16 * *", job=implnet_job_cuahsihisloganrivergamutids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisloganrivergamutids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisloganriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisloganriverids0.py deleted file mode 100644 index 6af511f8..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisloganriverids0.py 
+++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisloganriverids0 import implnet_job_cuahsihisloganriverids0 - -@schedule(cron_schedule="0 16 15 * *", job=implnet_job_cuahsihisloganriverids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisloganriverids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislterntlwoodruffids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislterntlwoodruffids0.py deleted file mode 100644 index 6b7aa5e6..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihislterntlwoodruffids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihislterntlwoodruffids0 import implnet_job_cuahsihislterntlwoodruffids0 - -@schedule(cron_schedule="0 20 16 * *", job=implnet_job_cuahsihislterntlwoodruffids0, execution_timezone="US/Central") -def implnet_sch_cuahsihislterntlwoodruffids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisluwlids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisluwlids0.py deleted file mode 100644 index 491cff9c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisluwlids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisluwlids0 import implnet_job_cuahsihisluwlids0 - -@schedule(cron_schedule="0 12 9 * *", job=implnet_job_cuahsihisluwlids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisluwlids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismaaeriids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismaaeriids0.py deleted file mode 100644 index 
201e8a1c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismaaeriids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihismaaeriids0 import implnet_job_cuahsihismaaeriids0 - -@schedule(cron_schedule="0 0 14 * *", job=implnet_job_cuahsihismaaeriids0, execution_timezone="US/Central") -def implnet_sch_cuahsihismaaeriids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismazarriverprojectids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismazarriverprojectids0.py deleted file mode 100644 index a087e02f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismazarriverprojectids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihismazarriverprojectids0 import implnet_job_cuahsihismazarriverprojectids0 - -@schedule(cron_schedule="0 8 13 * *", job=implnet_job_cuahsihismazarriverprojectids0, execution_timezone="US/Central") -def implnet_sch_cuahsihismazarriverprojectids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismmaatacamaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismmaatacamaids0.py deleted file mode 100644 index fc42e187..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismmaatacamaids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihismmaatacamaids0 import implnet_job_cuahsihismmaatacamaids0 - -@schedule(cron_schedule="0 0 9 * *", job=implnet_job_cuahsihismmaatacamaids0, execution_timezone="US/Central") -def implnet_sch_cuahsihismmaatacamaids0(_context): - run_config = {} - return run_config diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismobilecrowdhydrologyids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismobilecrowdhydrologyids0.py deleted file mode 100644 index 8b415d15..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismobilecrowdhydrologyids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihismobilecrowdhydrologyids0 import implnet_job_cuahsihismobilecrowdhydrologyids0 - -@schedule(cron_schedule="0 4 16 * *", job=implnet_job_cuahsihismobilecrowdhydrologyids0, execution_timezone="US/Central") -def implnet_sch_cuahsihismobilecrowdhydrologyids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismopexids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismopexids0.py deleted file mode 100644 index 148d4f7a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismopexids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihismopexids0 import implnet_job_cuahsihismopexids0 - -@schedule(cron_schedule="0 0 21 * *", job=implnet_job_cuahsihismopexids0, execution_timezone="US/Central") -def implnet_sch_cuahsihismopexids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismuddyriverids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismuddyriverids0.py deleted file mode 100644 index 94a02c32..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismuddyriverids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihismuddyriverids0 import implnet_job_cuahsihismuddyriverids0 - -@schedule(cron_schedule="0 12 8 * *", 
job=implnet_job_cuahsihismuddyriverids0, execution_timezone="US/Central") -def implnet_sch_cuahsihismuddyriverids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismudlakeids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismudlakeids0.py deleted file mode 100644 index d791ae51..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismudlakeids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihismudlakeids0 import implnet_job_cuahsihismudlakeids0 - -@schedule(cron_schedule="0 8 15 * *", job=implnet_job_cuahsihismudlakeids0, execution_timezone="US/Central") -def implnet_sch_cuahsihismudlakeids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismwdisids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismwdisids0.py deleted file mode 100644 index 1ac8f637..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismwdisids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihismwdisids0 import implnet_job_cuahsihismwdisids0 - -@schedule(cron_schedule="0 12 15 * *", job=implnet_job_cuahsihismwdisids0, execution_timezone="US/Central") -def implnet_sch_cuahsihismwdisids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismwraids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismwraids0.py deleted file mode 100644 index e0b5f4bb..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihismwraids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihismwraids0 import implnet_job_cuahsihismwraids0 - 
-@schedule(cron_schedule="0 20 13 * *", job=implnet_job_cuahsihismwraids0, execution_timezone="US/Central") -def implnet_sch_cuahsihismwraids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnashrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnashrwaids0.py deleted file mode 100644 index 11f0da34..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnashrwaids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisnashrwaids0 import implnet_job_cuahsihisnashrwaids0 - -@schedule(cron_schedule="0 0 16 * *", job=implnet_job_cuahsihisnashrwaids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisnashrwaids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnceiww2ids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnceiww2ids0.py deleted file mode 100644 index 857f66f5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnceiww2ids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisnceiww2ids0 import implnet_job_cuahsihisnceiww2ids0 - -@schedule(cron_schedule="0 4 14 * *", job=implnet_job_cuahsihisnceiww2ids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisnceiww2ids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisneonids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisneonids0.py deleted file mode 100644 index 3ad1b8e5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisneonids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from 
jobs.implnet_jobs_cuahsihisneonids0 import implnet_job_cuahsihisneonids0 - -@schedule(cron_schedule="0 12 7 * *", job=implnet_job_cuahsihisneonids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisneonids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnevadosids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnevadosids0.py deleted file mode 100644 index d202dbfe..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnevadosids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisnevadosids0 import implnet_job_cuahsihisnevadosids0 - -@schedule(cron_schedule="0 0 13 * *", job=implnet_job_cuahsihisnevadosids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisnevadosids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnevcanids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnevcanids0.py deleted file mode 100644 index 92b13937..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnevcanids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisnevcanids0 import implnet_job_cuahsihisnevcanids0 - -@schedule(cron_schedule="0 4 19 * *", job=implnet_job_cuahsihisnevcanids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisnevcanids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnewnids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnewnids0.py deleted file mode 100644 index 40a3a618..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnewnids0.py +++ /dev/null @@ -1,8 +0,0 @@ 
-from dagster import schedule - -from jobs.implnet_jobs_cuahsihisnewnids0 import implnet_job_cuahsihisnewnids0 - -@schedule(cron_schedule="0 0 12 * *", job=implnet_job_cuahsihisnewnids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisnewnids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnhgswofids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnhgswofids0.py deleted file mode 100644 index a739aa88..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnhgswofids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisnhgswofids0 import implnet_job_cuahsihisnhgswofids0 - -@schedule(cron_schedule="0 16 20 * *", job=implnet_job_cuahsihisnhgswofids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisnhgswofids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnooksackmicroclimatenetworkids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnooksackmicroclimatenetworkids0.py deleted file mode 100644 index 1d5d40d2..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisnooksackmicroclimatenetworkids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisnooksackmicroclimatenetworkids0 import implnet_job_cuahsihisnooksackmicroclimatenetworkids0 - -@schedule(cron_schedule="0 8 7 * *", job=implnet_job_cuahsihisnooksackmicroclimatenetworkids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisnooksackmicroclimatenetworkids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisodmkentstateids0.py 
b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisodmkentstateids0.py deleted file mode 100644 index 27ca639f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisodmkentstateids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisodmkentstateids0 import implnet_job_cuahsihisodmkentstateids0 - -@schedule(cron_schedule="0 4 10 * *", job=implnet_job_cuahsihisodmkentstateids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisodmkentstateids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisorsancohabids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisorsancohabids0.py deleted file mode 100644 index 92379894..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisorsancohabids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisorsancohabids0 import implnet_job_cuahsihisorsancohabids0 - -@schedule(cron_schedule="0 16 13 * *", job=implnet_job_cuahsihisorsancohabids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisorsancohabids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihispanolaodmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihispanolaodmids0.py deleted file mode 100644 index b104ead5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihispanolaodmids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihispanolaodmids0 import implnet_job_cuahsihispanolaodmids0 - -@schedule(cron_schedule="0 20 11 * *", job=implnet_job_cuahsihispanolaodmids0, execution_timezone="US/Central") -def implnet_sch_cuahsihispanolaodmids0(_context): - run_config 
= {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisparalanaturalezaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisparalanaturalezaids0.py deleted file mode 100644 index 9be4e39b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisparalanaturalezaids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisparalanaturalezaids0 import implnet_job_cuahsihisparalanaturalezaids0 - -@schedule(cron_schedule="0 20 14 * *", job=implnet_job_cuahsihisparalanaturalezaids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisparalanaturalezaids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisprovorivergamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisprovorivergamutids0.py deleted file mode 100644 index dd9a83b0..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisprovorivergamutids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisprovorivergamutids0 import implnet_job_cuahsihisprovorivergamutids0 - -@schedule(cron_schedule="0 0 11 * *", job=implnet_job_cuahsihisprovorivergamutids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisprovorivergamutids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisredbuttecreekgamutids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisredbuttecreekgamutids0.py deleted file mode 100644 index b92891a3..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisredbuttecreekgamutids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from 
jobs.implnet_jobs_cuahsihisredbuttecreekgamutids0 import implnet_job_cuahsihisredbuttecreekgamutids0 - -@schedule(cron_schedule="0 4 18 * *", job=implnet_job_cuahsihisredbuttecreekgamutids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisredbuttecreekgamutids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisrmblids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisrmblids0.py deleted file mode 100644 index 8331350a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisrmblids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisrmblids0 import implnet_job_cuahsihisrmblids0 - -@schedule(cron_schedule="0 16 11 * *", job=implnet_job_cuahsihisrmblids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisrmblids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihissagehencreekids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihissagehencreekids0.py deleted file mode 100644 index f2439e2f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihissagehencreekids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihissagehencreekids0 import implnet_job_cuahsihissagehencreekids0 - -@schedule(cron_schedule="0 0 17 * *", job=implnet_job_cuahsihissagehencreekids0, execution_timezone="US/Central") -def implnet_sch_cuahsihissagehencreekids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisscanids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisscanids0.py deleted file mode 100644 index c9e7f9e5..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisscanids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisscanids0 import implnet_job_cuahsihisscanids0 - -@schedule(cron_schedule="0 20 15 * *", job=implnet_job_cuahsihisscanids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisscanids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisscotlandnwisids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisscotlandnwisids0.py deleted file mode 100644 index bca44679..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisscotlandnwisids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisscotlandnwisids0 import implnet_job_cuahsihisscotlandnwisids0 - -@schedule(cron_schedule="0 0 8 * *", job=implnet_job_cuahsihisscotlandnwisids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisscotlandnwisids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisshalenetworkodmids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisshalenetworkodmids0.py deleted file mode 100644 index 8bf790c8..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisshalenetworkodmids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisshalenetworkodmids0 import implnet_job_cuahsihisshalenetworkodmids0 - -@schedule(cron_schedule="0 20 12 * *", job=implnet_job_cuahsihisshalenetworkodmids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisshalenetworkodmids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisshalenetworkodmids1.py 
b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisshalenetworkodmids1.py deleted file mode 100644 index eef4e1a4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisshalenetworkodmids1.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisshalenetworkodmids1 import implnet_job_cuahsihisshalenetworkodmids1 - -@schedule(cron_schedule="0 4 17 * *", job=implnet_job_cuahsihisshalenetworkodmids1, execution_timezone="US/Central") -def implnet_sch_cuahsihisshalenetworkodmids1(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisskcmilltownids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisskcmilltownids0.py deleted file mode 100644 index 5508cf3c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisskcmilltownids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisskcmilltownids0 import implnet_job_cuahsihisskcmilltownids0 - -@schedule(cron_schedule="0 12 12 * *", job=implnet_job_cuahsihisskcmilltownids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisskcmilltownids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihissnotelids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihissnotelids0.py deleted file mode 100644 index 3fb8e587..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihissnotelids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihissnotelids0 import implnet_job_cuahsihissnotelids0 - -@schedule(cron_schedule="0 0 19 * *", job=implnet_job_cuahsihissnotelids0, execution_timezone="US/Central") -def implnet_sch_cuahsihissnotelids0(_context): - 
run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisswedishmonitoringdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisswedishmonitoringdataids0.py deleted file mode 100644 index 5db961b1..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisswedishmonitoringdataids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisswedishmonitoringdataids0 import implnet_job_cuahsihisswedishmonitoringdataids0 - -@schedule(cron_schedule="0 4 20 * *", job=implnet_job_cuahsihisswedishmonitoringdataids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisswedishmonitoringdataids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistarlandwaterqualityids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistarlandwaterqualityids0.py deleted file mode 100644 index 7693e39f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistarlandwaterqualityids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihistarlandwaterqualityids0 import implnet_job_cuahsihistarlandwaterqualityids0 - -@schedule(cron_schedule="0 8 14 * *", job=implnet_job_cuahsihistarlandwaterqualityids0, execution_timezone="US/Central") -def implnet_sch_cuahsihistarlandwaterqualityids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistncwaterdataids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistncwaterdataids0.py deleted file mode 100644 index ffdd4638..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistncwaterdataids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import 
schedule - -from jobs.implnet_jobs_cuahsihistncwaterdataids0 import implnet_job_cuahsihistncwaterdataids0 - -@schedule(cron_schedule="0 20 7 * *", job=implnet_job_cuahsihistncwaterdataids0, execution_timezone="US/Central") -def implnet_sch_cuahsihistncwaterdataids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistrwaids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistrwaids0.py deleted file mode 100644 index 96e57ff1..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistrwaids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihistrwaids0 import implnet_job_cuahsihistrwaids0 - -@schedule(cron_schedule="0 0 18 * *", job=implnet_job_cuahsihistrwaids0, execution_timezone="US/Central") -def implnet_sch_cuahsihistrwaids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistuolumnemdwids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistuolumnemdwids0.py deleted file mode 100644 index e32d57ba..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihistuolumnemdwids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihistuolumnemdwids0 import implnet_job_cuahsihistuolumnemdwids0 - -@schedule(cron_schedule="0 12 11 * *", job=implnet_job_cuahsihistuolumnemdwids0, execution_timezone="US/Central") -def implnet_sch_cuahsihistuolumnemdwids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisubwpadids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisubwpadids0.py deleted file mode 100644 index f34b68cd..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisubwpadids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisubwpadids0 import implnet_job_cuahsihisubwpadids0 - -@schedule(cron_schedule="0 20 17 * *", job=implnet_job_cuahsihisubwpadids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisubwpadids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisumbcgwids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisumbcgwids0.py deleted file mode 100644 index d18eed4f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisumbcgwids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisumbcgwids0 import implnet_job_cuahsihisumbcgwids0 - -@schedule(cron_schedule="0 16 12 * *", job=implnet_job_cuahsihisumbcgwids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisumbcgwids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisumbcwqids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisumbcwqids0.py deleted file mode 100644 index 34184964..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisumbcwqids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisumbcwqids0 import implnet_job_cuahsihisumbcwqids0 - -@schedule(cron_schedule="0 4 9 * *", job=implnet_job_cuahsihisumbcwqids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisumbcwqids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisunhsnowids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisunhsnowids0.py deleted 
file mode 100644 index 95d0853f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisunhsnowids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisunhsnowids0 import implnet_job_cuahsihisunhsnowids0 - -@schedule(cron_schedule="0 8 20 * *", job=implnet_job_cuahsihisunhsnowids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisunhsnowids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisweiherbachids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisweiherbachids0.py deleted file mode 100644 index 86abbb12..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisweiherbachids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisweiherbachids0 import implnet_job_cuahsihisweiherbachids0 - -@schedule(cron_schedule="0 4 13 * *", job=implnet_job_cuahsihisweiherbachids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisweiherbachids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisyosemitehydroclimatenetworkids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisyosemitehydroclimatenetworkids0.py deleted file mode 100644 index 60eec163..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_cuahsihisyosemitehydroclimatenetworkids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuahsihisyosemitehydroclimatenetworkids0 import implnet_job_cuahsihisyosemitehydroclimatenetworkids0 - -@schedule(cron_schedule="0 8 8 * *", job=implnet_job_cuahsihisyosemitehydroclimatenetworkids0, execution_timezone="US/Central") -def implnet_sch_cuahsihisyosemitehydroclimatenetworkids0(_context): - run_config = {} - 
return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_dams0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_dams0.py deleted file mode 100644 index 4e1dac0f..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_dams0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_dams0 import implnet_job_dams0 - -@schedule(cron_schedule="0 12 23 * *", job=implnet_job_dams0, execution_timezone="US/Central") -def implnet_sch_dams0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_dams1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_dams1.py deleted file mode 100644 index d9257d08..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_dams1.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_dams1 import implnet_job_dams1 - -@schedule(cron_schedule="0 16 23 * *", job=implnet_job_dams1, execution_timezone="US/Central") -def implnet_sch_dams1(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_damspids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_damspids0.py deleted file mode 100644 index 864b041c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_damspids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_damspids0 import implnet_job_damspids0 - -@schedule(cron_schedule="0 0 7 * *", job=implnet_job_damspids0, execution_timezone="US/Central") -def implnet_sch_damspids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_demo0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_demo0.py deleted file mode 100644 index 
85ea1e34..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_demo0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_demo0 import implnet_job_demo0 - -@schedule(cron_schedule="0 12 27 * *", job=implnet_job_demo0, execution_timezone="US/Central") -def implnet_sch_demo0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_gfv11pois0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_gfv11pois0.py deleted file mode 100644 index ced09e2d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_gfv11pois0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_gfv11pois0 import implnet_job_gfv11pois0 - -@schedule(cron_schedule="0 16 6 * *", job=implnet_job_gfv11pois0, execution_timezone="US/Central") -def implnet_sch_gfv11pois0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_gfv11pois1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_gfv11pois1.py deleted file mode 100644 index 0dd95b15..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_gfv11pois1.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_gfv11pois1 import implnet_job_gfv11pois1 - -@schedule(cron_schedule="0 12 6 * *", job=implnet_job_gfv11pois1, execution_timezone="US/Central") -def implnet_sch_gfv11pois1(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hmw0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hmw0.py deleted file mode 100644 index 150d8862..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hmw0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from 
jobs.implnet_jobs_hmw0 import implnet_job_hmw0 - -@schedule(cron_schedule="0 0 25 * *", job=implnet_job_hmw0, execution_timezone="US/Central") -def implnet_sch_hmw0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hmw1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hmw1.py deleted file mode 100644 index ab65446c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hmw1.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_hmw1 import implnet_job_hmw1 - -@schedule(cron_schedule="0 4 25 * *", job=implnet_job_hmw1, execution_timezone="US/Central") -def implnet_sch_hmw1(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu020.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu020.py deleted file mode 100644 index 0bbb1e18..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu020.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_hu020 import implnet_job_hu020 - -@schedule(cron_schedule="0 12 24 * *", job=implnet_job_hu020, execution_timezone="US/Central") -def implnet_sch_hu020(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu040.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu040.py deleted file mode 100644 index 116970d9..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu040.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_hu040 import implnet_job_hu040 - -@schedule(cron_schedule="0 12 22 * *", job=implnet_job_hu040, execution_timezone="US/Central") -def implnet_sch_hu040(_context): - run_config = {} - return run_config diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu060.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu060.py deleted file mode 100644 index 25998aa2..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu060.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_hu060 import implnet_job_hu060 - -@schedule(cron_schedule="0 20 21 * *", job=implnet_job_hu060, execution_timezone="US/Central") -def implnet_sch_hu060(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu080.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu080.py deleted file mode 100644 index 4475cf5d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu080.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_hu080 import implnet_job_hu080 - -@schedule(cron_schedule="0 8 22 * *", job=implnet_job_hu080, execution_timezone="US/Central") -def implnet_sch_hu080(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu100.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu100.py deleted file mode 100644 index e358d5a2..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hu100.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_hu100 import implnet_job_hu100 - -@schedule(cron_schedule="0 4 24 * *", job=implnet_job_hu100, execution_timezone="US/Central") -def implnet_sch_hu100(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_huc12pp0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_huc12pp0.py deleted file mode 100644 index 5bfa5a67..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_huc12pp0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_huc12pp0 import implnet_job_huc12pp0 - -@schedule(cron_schedule="0 8 25 * *", job=implnet_job_huc12pp0, execution_timezone="US/Central") -def implnet_sch_huc12pp0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_huc12pp1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_huc12pp1.py deleted file mode 100644 index c1e6c81c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_huc12pp1.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_huc12pp1 import implnet_job_huc12pp1 - -@schedule(cron_schedule="0 12 25 * *", job=implnet_job_huc12pp1, execution_timezone="US/Central") -def implnet_sch_huc12pp1(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hydrologicunit0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hydrologicunit0.py deleted file mode 100644 index babd25af..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_hydrologicunit0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_hydrologicunit0 import implnet_job_hydrologicunit0 - -@schedule(cron_schedule="0 20 6 * *", job=implnet_job_hydrologicunit0, execution_timezone="US/Central") -def implnet_sch_hydrologicunit0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_links0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_links0.py deleted file mode 100644 index adca26f5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_links0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - 
-from jobs.implnet_jobs_links0 import implnet_job_links0 - -@schedule(cron_schedule="0 8 27 * *", job=implnet_job_links0, execution_timezone="US/Central") -def implnet_sch_links0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_mainstems0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_mainstems0.py deleted file mode 100644 index 95b80cee..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_mainstems0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_mainstems0 import implnet_job_mainstems0 - -@schedule(cron_schedule="0 16 24 * *", job=implnet_job_mainstems0, execution_timezone="US/Central") -def implnet_sch_mainstems0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nataq0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nataq0.py deleted file mode 100644 index c9c7c350..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nataq0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nataq0 import implnet_job_nataq0 - -@schedule(cron_schedule="0 0 22 * *", job=implnet_job_nataq0, execution_timezone="US/Central") -def implnet_sch_nataq0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose0.py deleted file mode 100644 index 0e14317c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nmwdiose0 import implnet_job_nmwdiose0 - -@schedule(cron_schedule="0 0 26 * *", job=implnet_job_nmwdiose0, execution_timezone="US/Central") -def 
implnet_sch_nmwdiose0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose1.py deleted file mode 100644 index d5a706c9..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose1.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nmwdiose1 import implnet_job_nmwdiose1 - -@schedule(cron_schedule="0 8 26 * *", job=implnet_job_nmwdiose1, execution_timezone="US/Central") -def implnet_sch_nmwdiose1(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose2.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose2.py deleted file mode 100644 index eee08799..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose2.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nmwdiose2 import implnet_job_nmwdiose2 - -@schedule(cron_schedule="0 20 25 * *", job=implnet_job_nmwdiose2, execution_timezone="US/Central") -def implnet_sch_nmwdiose2(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose3.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose3.py deleted file mode 100644 index 81a39e99..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose3.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nmwdiose3 import implnet_job_nmwdiose3 - -@schedule(cron_schedule="0 16 25 * *", job=implnet_job_nmwdiose3, execution_timezone="US/Central") -def implnet_sch_nmwdiose3(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose4.py 
b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose4.py deleted file mode 100644 index ae6c0a4a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdiose4.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nmwdiose4 import implnet_job_nmwdiose4 - -@schedule(cron_schedule="0 4 26 * *", job=implnet_job_nmwdiose4, execution_timezone="US/Central") -def implnet_sch_nmwdiose4(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdist0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdist0.py deleted file mode 100644 index 8cc30594..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nmwdist0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nmwdist0 import implnet_job_nmwdist0 - -@schedule(cron_schedule="0 12 26 * *", job=implnet_job_nmwdist0, execution_timezone="US/Central") -def implnet_sch_nmwdist0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw0.py deleted file mode 100644 index a707538a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw0 import implnet_job_nwisgw0 - -@schedule(cron_schedule="0 16 5 * *", job=implnet_job_nwisgw0, execution_timezone="US/Central") -def implnet_sch_nwisgw0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw1.py deleted file mode 100644 index 3e6826c5..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw1.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw1 import implnet_job_nwisgw1 - -@schedule(cron_schedule="0 8 4 * *", job=implnet_job_nwisgw1, execution_timezone="US/Central") -def implnet_sch_nwisgw1(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw10.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw10.py deleted file mode 100644 index 41ab4c13..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw10.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw10 import implnet_job_nwisgw10 - -@schedule(cron_schedule="0 4 2 * *", job=implnet_job_nwisgw10, execution_timezone="US/Central") -def implnet_sch_nwisgw10(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw11.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw11.py deleted file mode 100644 index 3ce3fc53..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw11.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw11 import implnet_job_nwisgw11 - -@schedule(cron_schedule="0 12 5 * *", job=implnet_job_nwisgw11, execution_timezone="US/Central") -def implnet_sch_nwisgw11(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw12.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw12.py deleted file mode 100644 index 56f993ba..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw12.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw12 import 
implnet_job_nwisgw12 - -@schedule(cron_schedule="0 12 1 * *", job=implnet_job_nwisgw12, execution_timezone="US/Central") -def implnet_sch_nwisgw12(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw13.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw13.py deleted file mode 100644 index 2b5d4a44..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw13.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw13 import implnet_job_nwisgw13 - -@schedule(cron_schedule="0 16 3 * *", job=implnet_job_nwisgw13, execution_timezone="US/Central") -def implnet_sch_nwisgw13(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw14.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw14.py deleted file mode 100644 index 3d6496e9..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw14.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw14 import implnet_job_nwisgw14 - -@schedule(cron_schedule="0 20 1 * *", job=implnet_job_nwisgw14, execution_timezone="US/Central") -def implnet_sch_nwisgw14(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw15.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw15.py deleted file mode 100644 index 509bcc69..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw15.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw15 import implnet_job_nwisgw15 - -@schedule(cron_schedule="0 8 2 * *", job=implnet_job_nwisgw15, execution_timezone="US/Central") -def implnet_sch_nwisgw15(_context): - run_config = {} - return 
run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw16.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw16.py deleted file mode 100644 index ebc73ed8..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw16.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw16 import implnet_job_nwisgw16 - -@schedule(cron_schedule="0 8 1 * *", job=implnet_job_nwisgw16, execution_timezone="US/Central") -def implnet_sch_nwisgw16(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw17.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw17.py deleted file mode 100644 index f495f454..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw17.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw17 import implnet_job_nwisgw17 - -@schedule(cron_schedule="0 0 5 * *", job=implnet_job_nwisgw17, execution_timezone="US/Central") -def implnet_sch_nwisgw17(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw18.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw18.py deleted file mode 100644 index b07ae31a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw18.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw18 import implnet_job_nwisgw18 - -@schedule(cron_schedule="0 4 5 * *", job=implnet_job_nwisgw18, execution_timezone="US/Central") -def implnet_sch_nwisgw18(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw19.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw19.py deleted 
file mode 100644 index edcb74fd..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw19.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw19 import implnet_job_nwisgw19 - -@schedule(cron_schedule="0 0 3 * *", job=implnet_job_nwisgw19, execution_timezone="US/Central") -def implnet_sch_nwisgw19(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw2.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw2.py deleted file mode 100644 index 7abba808..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw2.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw2 import implnet_job_nwisgw2 - -@schedule(cron_schedule="0 12 2 * *", job=implnet_job_nwisgw2, execution_timezone="US/Central") -def implnet_sch_nwisgw2(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw20.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw20.py deleted file mode 100644 index ca0531d9..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw20.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw20 import implnet_job_nwisgw20 - -@schedule(cron_schedule="0 0 1 * *", job=implnet_job_nwisgw20, execution_timezone="US/Central") -def implnet_sch_nwisgw20(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw21.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw21.py deleted file mode 100644 index e1067dcb..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw21.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from 
jobs.implnet_jobs_nwisgw21 import implnet_job_nwisgw21 - -@schedule(cron_schedule="0 12 4 * *", job=implnet_job_nwisgw21, execution_timezone="US/Central") -def implnet_sch_nwisgw21(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw22.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw22.py deleted file mode 100644 index 6db0b849..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw22.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw22 import implnet_job_nwisgw22 - -@schedule(cron_schedule="0 4 1 * *", job=implnet_job_nwisgw22, execution_timezone="US/Central") -def implnet_sch_nwisgw22(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw23.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw23.py deleted file mode 100644 index 0f821b3b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw23.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw23 import implnet_job_nwisgw23 - -@schedule(cron_schedule="0 0 2 * *", job=implnet_job_nwisgw23, execution_timezone="US/Central") -def implnet_sch_nwisgw23(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw24.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw24.py deleted file mode 100644 index 0980e281..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw24.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw24 import implnet_job_nwisgw24 - -@schedule(cron_schedule="0 16 2 * *", job=implnet_job_nwisgw24, execution_timezone="US/Central") -def implnet_sch_nwisgw24(_context): 
- run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw25.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw25.py deleted file mode 100644 index be96a0d1..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw25.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw25 import implnet_job_nwisgw25 - -@schedule(cron_schedule="0 16 1 * *", job=implnet_job_nwisgw25, execution_timezone="US/Central") -def implnet_sch_nwisgw25(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw26.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw26.py deleted file mode 100644 index fd2da44a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw26.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw26 import implnet_job_nwisgw26 - -@schedule(cron_schedule="0 8 3 * *", job=implnet_job_nwisgw26, execution_timezone="US/Central") -def implnet_sch_nwisgw26(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw27.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw27.py deleted file mode 100644 index 344a3223..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw27.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw27 import implnet_job_nwisgw27 - -@schedule(cron_schedule="0 16 4 * *", job=implnet_job_nwisgw27, execution_timezone="US/Central") -def implnet_sch_nwisgw27(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw28.py 
b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw28.py deleted file mode 100644 index f3eadc99..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw28.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw28 import implnet_job_nwisgw28 - -@schedule(cron_schedule="0 4 3 * *", job=implnet_job_nwisgw28, execution_timezone="US/Central") -def implnet_sch_nwisgw28(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw3.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw3.py deleted file mode 100644 index 9d918109..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw3.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw3 import implnet_job_nwisgw3 - -@schedule(cron_schedule="0 0 4 * *", job=implnet_job_nwisgw3, execution_timezone="US/Central") -def implnet_sch_nwisgw3(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw4.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw4.py deleted file mode 100644 index b1a0ceee..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw4.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw4 import implnet_job_nwisgw4 - -@schedule(cron_schedule="0 4 4 * *", job=implnet_job_nwisgw4, execution_timezone="US/Central") -def implnet_sch_nwisgw4(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw5.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw5.py deleted file mode 100644 index 3900514a..00000000 --- 
a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw5.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw5 import implnet_job_nwisgw5 - -@schedule(cron_schedule="0 12 3 * *", job=implnet_job_nwisgw5, execution_timezone="US/Central") -def implnet_sch_nwisgw5(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw6.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw6.py deleted file mode 100644 index addbec36..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw6.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw6 import implnet_job_nwisgw6 - -@schedule(cron_schedule="0 20 3 * *", job=implnet_job_nwisgw6, execution_timezone="US/Central") -def implnet_sch_nwisgw6(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw7.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw7.py deleted file mode 100644 index 0f8e4348..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw7.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw7 import implnet_job_nwisgw7 - -@schedule(cron_schedule="0 8 5 * *", job=implnet_job_nwisgw7, execution_timezone="US/Central") -def implnet_sch_nwisgw7(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw8.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw8.py deleted file mode 100644 index 4b45d89e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw8.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw8 import implnet_job_nwisgw8 - 
-@schedule(cron_schedule="0 20 4 * *", job=implnet_job_nwisgw8, execution_timezone="US/Central") -def implnet_sch_nwisgw8(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw9.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw9.py deleted file mode 100644 index f0090f77..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwisgw9.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwisgw9 import implnet_job_nwisgw9 - -@schedule(cron_schedule="0 20 2 * *", job=implnet_job_nwisgw9, execution_timezone="US/Central") -def implnet_sch_nwisgw9(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite0.py deleted file mode 100644 index 6d155641..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwissite0 import implnet_job_nwissite0 - -@schedule(cron_schedule="0 4 6 * *", job=implnet_job_nwissite0, execution_timezone="US/Central") -def implnet_sch_nwissite0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite1.py deleted file mode 100644 index 9045a707..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite1.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwissite1 import implnet_job_nwissite1 - -@schedule(cron_schedule="0 20 5 * *", job=implnet_job_nwissite1, execution_timezone="US/Central") -def implnet_sch_nwissite1(_context): - run_config = {} - return run_config diff --git 
a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite2.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite2.py deleted file mode 100644 index eff433c4..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite2.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwissite2 import implnet_job_nwissite2 - -@schedule(cron_schedule="0 8 6 * *", job=implnet_job_nwissite2, execution_timezone="US/Central") -def implnet_sch_nwissite2(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite3.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite3.py deleted file mode 100644 index e8a88282..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_nwissite3.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nwissite3 import implnet_job_nwissite3 - -@schedule(cron_schedule="0 0 6 * *", job=implnet_job_nwissite3, execution_timezone="US/Central") -def implnet_sch_nwissite3(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_places0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_places0.py deleted file mode 100644 index 0a0ad632..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_places0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_places0 import implnet_job_places0 - -@schedule(cron_schedule="0 20 24 * *", job=implnet_job_places0, execution_timezone="US/Central") -def implnet_sch_places0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_princiaq0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_princiaq0.py deleted file mode 
100644 index e33474ce..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_princiaq0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_princiaq0 import implnet_job_princiaq0 - -@schedule(cron_schedule="0 16 22 * *", job=implnet_job_princiaq0, execution_timezone="US/Central") -def implnet_sch_princiaq0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_pws0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_pws0.py deleted file mode 100644 index 8defec9d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_pws0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_pws0 import implnet_job_pws0 - -@schedule(cron_schedule="0 16 21 * *", job=implnet_job_pws0, execution_timezone="US/Central") -def implnet_sch_pws0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage0.py deleted file mode 100644 index 5dbcb2ae..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_refgage0 import implnet_job_refgage0 - -@schedule(cron_schedule="0 20 22 * *", job=implnet_job_refgage0, execution_timezone="US/Central") -def implnet_sch_refgage0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage1.py deleted file mode 100644 index 30e8c638..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage1.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_refgage1 
import implnet_job_refgage1 - -@schedule(cron_schedule="0 8 23 * *", job=implnet_job_refgage1, execution_timezone="US/Central") -def implnet_sch_refgage1(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage2.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage2.py deleted file mode 100644 index 7038c026..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage2.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_refgage2 import implnet_job_refgage2 - -@schedule(cron_schedule="0 4 23 * *", job=implnet_job_refgage2, execution_timezone="US/Central") -def implnet_sch_refgage2(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage3.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage3.py deleted file mode 100644 index 106951da..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_refgage3.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_refgage3 import implnet_job_refgage3 - -@schedule(cron_schedule="0 0 23 * *", job=implnet_job_refgage3, execution_timezone="US/Central") -def implnet_sch_refgage3(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_rise0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_rise0.py deleted file mode 100644 index 4efb4c9d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_rise0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_rise0 import implnet_job_rise0 - -@schedule(cron_schedule="0 0 27 * *", job=implnet_job_rise0, execution_timezone="US/Central") -def implnet_sch_rise0(_context): - run_config = {} - return run_config diff 
--git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_sechydrgreg0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_sechydrgreg0.py deleted file mode 100644 index 64355867..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_sechydrgreg0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_sechydrgreg0 import implnet_job_sechydrgreg0 - -@schedule(cron_schedule="0 8 21 * *", job=implnet_job_sechydrgreg0, execution_timezone="US/Central") -def implnet_sch_sechydrgreg0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_selfieids0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_selfieids0.py deleted file mode 100644 index b5c76804..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_selfieids0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_selfieids0 import implnet_job_selfieids0 - -@schedule(cron_schedule="0 16 26 * *", job=implnet_job_selfieids0, execution_timezone="US/Central") -def implnet_sch_selfieids0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_states0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_states0.py deleted file mode 100644 index ab653cb3..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_states0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_states0 import implnet_job_states0 - -@schedule(cron_schedule="0 0 24 * *", job=implnet_job_states0, execution_timezone="US/Central") -def implnet_sch_states0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_ua100.py 
b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_ua100.py deleted file mode 100644 index 07d3c80d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_ua100.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ua100 import implnet_job_ua100 - -@schedule(cron_schedule="0 20 23 * *", job=implnet_job_ua100, execution_timezone="US/Central") -def implnet_sch_ua100(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade0.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade0.py deleted file mode 100644 index 8cac8291..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade0.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade0 import implnet_job_wade0 - -@schedule(cron_schedule="0 0 1 * *", job=implnet_job_wade0, execution_timezone="US/Central") -def implnet_sch_wade0(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade1.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade1.py deleted file mode 100644 index 71a6dd1a..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade1.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade1 import implnet_job_wade1 - -@schedule(cron_schedule="0 16 3 * *", job=implnet_job_wade1, execution_timezone="US/Central") -def implnet_sch_wade1(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade10.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade10.py deleted file mode 100644 index 9c25907e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade10.py +++ /dev/null @@ -1,8 +0,0 @@ -from 
dagster import schedule - -from jobs.implnet_jobs_wade10 import implnet_job_wade10 - -@schedule(cron_schedule="0 4 2 * *", job=implnet_job_wade10, execution_timezone="US/Central") -def implnet_sch_wade10(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade11.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade11.py deleted file mode 100644 index 27d16cbf..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade11.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade11 import implnet_job_wade11 - -@schedule(cron_schedule="0 0 4 * *", job=implnet_job_wade11, execution_timezone="US/Central") -def implnet_sch_wade11(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade12.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade12.py deleted file mode 100644 index d0857964..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade12.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade12 import implnet_job_wade12 - -@schedule(cron_schedule="0 4 3 * *", job=implnet_job_wade12, execution_timezone="US/Central") -def implnet_sch_wade12(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade13.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade13.py deleted file mode 100644 index a148b22d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade13.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade13 import implnet_job_wade13 - -@schedule(cron_schedule="0 16 2 * *", job=implnet_job_wade13, execution_timezone="US/Central") -def implnet_sch_wade13(_context): - run_config = {} - 
return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade14.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade14.py deleted file mode 100644 index f958c5ca..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade14.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade14 import implnet_job_wade14 - -@schedule(cron_schedule="0 8 2 * *", job=implnet_job_wade14, execution_timezone="US/Central") -def implnet_sch_wade14(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade15.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade15.py deleted file mode 100644 index ac28e77b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade15.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade15 import implnet_job_wade15 - -@schedule(cron_schedule="0 20 1 * *", job=implnet_job_wade15, execution_timezone="US/Central") -def implnet_sch_wade15(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade16.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade16.py deleted file mode 100644 index 811d3d65..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade16.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade16 import implnet_job_wade16 - -@schedule(cron_schedule="0 12 3 * *", job=implnet_job_wade16, execution_timezone="US/Central") -def implnet_sch_wade16(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade17.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade17.py deleted file mode 100644 index 
a54ef062..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade17.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade17 import implnet_job_wade17 - -@schedule(cron_schedule="0 4 1 * *", job=implnet_job_wade17, execution_timezone="US/Central") -def implnet_sch_wade17(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade18.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade18.py deleted file mode 100644 index 70b2eadc..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade18.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade18 import implnet_job_wade18 - -@schedule(cron_schedule="0 12 2 * *", job=implnet_job_wade18, execution_timezone="US/Central") -def implnet_sch_wade18(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade19.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade19.py deleted file mode 100644 index 978d2f6e..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade19.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade19 import implnet_job_wade19 - -@schedule(cron_schedule="0 0 3 * *", job=implnet_job_wade19, execution_timezone="US/Central") -def implnet_sch_wade19(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade2.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade2.py deleted file mode 100644 index f41ffb68..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade2.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade2 import implnet_job_wade2 - 
-@schedule(cron_schedule="0 20 27 * *", job=implnet_job_wade2, execution_timezone="US/Central") -def implnet_sch_wade2(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade3.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade3.py deleted file mode 100644 index 78c852f3..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade3.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade3 import implnet_job_wade3 - -@schedule(cron_schedule="0 16 1 * *", job=implnet_job_wade3, execution_timezone="US/Central") -def implnet_sch_wade3(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade4.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade4.py deleted file mode 100644 index 469a0ea5..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade4.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade4 import implnet_job_wade4 - -@schedule(cron_schedule="0 8 3 * *", job=implnet_job_wade4, execution_timezone="US/Central") -def implnet_sch_wade4(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade5.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade5.py deleted file mode 100644 index d64c68c7..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade5.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade5 import implnet_job_wade5 - -@schedule(cron_schedule="0 0 2 * *", job=implnet_job_wade5, execution_timezone="US/Central") -def implnet_sch_wade5(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade6.py 
b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade6.py deleted file mode 100644 index d6049d0c..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade6.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade6 import implnet_job_wade6 - -@schedule(cron_schedule="0 20 3 * *", job=implnet_job_wade6, execution_timezone="US/Central") -def implnet_sch_wade6(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade7.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade7.py deleted file mode 100644 index 2d6c8681..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade7.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade7 import implnet_job_wade7 - -@schedule(cron_schedule="0 12 1 * *", job=implnet_job_wade7, execution_timezone="US/Central") -def implnet_sch_wade7(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade8.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade8.py deleted file mode 100644 index de64874b..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade8.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wade8 import implnet_job_wade8 - -@schedule(cron_schedule="0 20 2 * *", job=implnet_job_wade8, execution_timezone="US/Central") -def implnet_sch_wade8(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade9.py b/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade9.py deleted file mode 100644 index 2188da6d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/sch/implnet_sch_wade9.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster 
import schedule - -from jobs.implnet_jobs_wade9 import implnet_job_wade9 - -@schedule(cron_schedule="0 8 1 * *", job=implnet_job_wade9, execution_timezone="US/Central") -def implnet_sch_wade9(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-iow/output/workspace.yaml b/dagster/implnets/generatedCode/implnet-iow/output/workspace.yaml deleted file mode 100644 index 54490e1d..00000000 --- a/dagster/implnets/generatedCode/implnet-iow/output/workspace.yaml +++ /dev/null @@ -1,4 +0,0 @@ -load_from: - - python_file: - relative_path: "repositories/repository.py" - working_directory: . \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_abacus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_abacus.py deleted file mode 100644 index 48184735..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_abacus.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_abacus import harvest_abacus - -@job -def implnet_job_abacus(): - harvest_abacus() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_acss.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_acss.py deleted file mode 100644 index 721894c6..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_acss.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_acss import harvest_acss - -@job -def implnet_job_acss(): - harvest_acss() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_adf.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_adf.py deleted file mode 100644 index e63e412d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_adf.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job 
- -from ops.implnet_ops_adf import harvest_adf - -@job -def implnet_job_adf(): - harvest_adf() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_arecibo.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_arecibo.py deleted file mode 100644 index 7b074a93..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_arecibo.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_arecibo import harvest_arecibo - -@job -def implnet_job_arecibo(): - harvest_arecibo() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_asulrdr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_asulrdr.py deleted file mode 100644 index 1be0888b..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_asulrdr.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_asulrdr import harvest_asulrdr - -@job -def implnet_job_asulrdr(): - harvest_asulrdr() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_aussda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_aussda.py deleted file mode 100644 index e832e112..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_aussda.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_aussda import harvest_aussda - -@job -def implnet_job_aussda(): - harvest_aussda() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_aws.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_aws.py deleted file mode 100644 index c3ed2f31..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_aws.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from 
ops.implnet_ops_aws import harvest_aws - -@job -def implnet_job_aws(): - harvest_aws() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_borealis.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_borealis.py deleted file mode 100644 index 70854a55..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_borealis.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_borealis import harvest_borealis - -@job -def implnet_job_borealis(): - harvest_borealis() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_chile.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_chile.py deleted file mode 100644 index c997fbe7..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_chile.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_chile import harvest_chile - -@job -def implnet_job_chile(): - harvest_chile() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cifor.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cifor.py deleted file mode 100644 index 811c5a10..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cifor.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cifor import harvest_cifor - -@job -def implnet_job_cifor(): - harvest_cifor() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cimmyt.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cimmyt.py deleted file mode 100644 index e7cd6da5..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cimmyt.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from 
ops.implnet_ops_cimmyt import harvest_cimmyt - -@job -def implnet_job_cimmyt(): - harvest_cimmyt() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cora.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cora.py deleted file mode 100644 index 62a6b827..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cora.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cora import harvest_cora - -@job -def implnet_job_cora(): - harvest_cora() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_crossda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_crossda.py deleted file mode 100644 index 11e92d10..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_crossda.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_crossda import harvest_crossda - -@job -def implnet_job_crossda(): - harvest_crossda() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cuhk.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cuhk.py deleted file mode 100644 index 1657e65a..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cuhk.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cuhk import harvest_cuhk - -@job -def implnet_job_cuhk(): - harvest_cuhk() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cyvers.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cyvers.py deleted file mode 100644 index 05c83045..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_cyvers.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cyvers 
import harvest_cyvers - -@job -def implnet_job_cyvers(): - harvest_cyvers() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_darus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_darus.py deleted file mode 100644 index 45227306..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_darus.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_darus import harvest_darus - -@job -def implnet_job_darus(): - harvest_darus() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_drp.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_drp.py deleted file mode 100644 index 2fd2b3da..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_drp.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_drp import harvest_drp - -@job -def implnet_job_drp(): - harvest_drp() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_dryad.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_dryad.py deleted file mode 100644 index aa6f9ba4..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_dryad.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_dryad import harvest_dryad - -@job -def implnet_job_dryad(): - harvest_dryad() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_edatos.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_edatos.py deleted file mode 100644 index f3a36e6b..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_edatos.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_edatos import harvest_edatos - -@job -def 
implnet_job_edatos(): - harvest_edatos() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_fiu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_fiu.py deleted file mode 100644 index 0d97b5f6..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_fiu.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_fiu import harvest_fiu - -@job -def implnet_job_fiu(): - harvest_fiu() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_gro.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_gro.py deleted file mode 100644 index 6d887034..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_gro.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_gro import harvest_gro - -@job -def implnet_job_gro(): - harvest_gro() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_harvard.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_harvard.py deleted file mode 100644 index 064ab7bb..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_harvard.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_harvard import harvest_harvard - -@job -def implnet_job_harvard(): - harvest_harvard() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_hopkins.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_hopkins.py deleted file mode 100644 index 27897b7a..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_hopkins.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_hopkins import harvest_hopkins - -@job -def implnet_job_hopkins(): - 
harvest_hopkins() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_hord.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_hord.py deleted file mode 100644 index 9b137417..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_hord.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_hord import harvest_hord - -@job -def implnet_job_hord(): - harvest_hord() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ibict.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ibict.py deleted file mode 100644 index 0a619c63..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ibict.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ibict import harvest_ibict - -@job -def implnet_job_ibict(): - harvest_ibict() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_icarda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_icarda.py deleted file mode 100644 index df6ee83d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_icarda.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_icarda import harvest_icarda - -@job -def implnet_job_icarda(): - harvest_icarda() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_icrisat.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_icrisat.py deleted file mode 100644 index ff268ab7..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_icrisat.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_icrisat import harvest_icrisat - -@job -def implnet_job_icrisat(): - harvest_icrisat() \ 
No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ifdc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ifdc.py deleted file mode 100644 index 44f0526e..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ifdc.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ifdc import harvest_ifdc - -@job -def implnet_job_ifdc(): - harvest_ifdc() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ifsttar.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ifsttar.py deleted file mode 100644 index 5a95c4b1..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ifsttar.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ifsttar import harvest_ifsttar - -@job -def implnet_job_ifsttar(): - harvest_ifsttar() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_iisg.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_iisg.py deleted file mode 100644 index 25b3d7af..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_iisg.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_iisg import harvest_iisg - -@job -def implnet_job_iisg(): - harvest_iisg() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_iit.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_iit.py deleted file mode 100644 index 7bb32b67..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_iit.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_iit import harvest_iit - -@job -def implnet_job_iit(): - harvest_iit() \ No newline at end of file diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ipc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ipc.py deleted file mode 100644 index da470ecf..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ipc.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ipc import harvest_ipc - -@job -def implnet_job_ipc(): - harvest_ipc() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_irl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_irl.py deleted file mode 100644 index e47f7f2b..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_irl.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_irl import harvest_irl - -@job -def implnet_job_irl(): - harvest_irl() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_irs.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_irs.py deleted file mode 100644 index f8b1a08d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_irs.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_irs import harvest_irs - -@job -def implnet_job_irs(): - harvest_irs() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_julich.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_julich.py deleted file mode 100644 index 5d0a29f5..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_julich.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_julich import harvest_julich - -@job -def implnet_job_julich(): - harvest_julich() \ No newline at end of file diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_lida.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_lida.py deleted file mode 100644 index f0a0e01b..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_lida.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_lida import harvest_lida - -@job -def implnet_job_lida(): - harvest_lida() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_manitoba.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_manitoba.py deleted file mode 100644 index 9d5e3ad4..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_manitoba.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_manitoba import harvest_manitoba - -@job -def implnet_job_manitoba(): - harvest_manitoba() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_matcommons.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_matcommons.py deleted file mode 100644 index 004f4a00..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_matcommons.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_matcommons import harvest_matcommons - -@job -def implnet_job_matcommons(): - harvest_matcommons() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_mdf.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_mdf.py deleted file mode 100644 index dfb76466..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_mdf.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_mdf import harvest_mdf - -@job -def implnet_job_mdf(): - harvest_mdf() \ No newline at end of file diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_milano.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_milano.py deleted file mode 100644 index 70161a3b..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_milano.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_milano import harvest_milano - -@job -def implnet_job_milano(): - harvest_milano() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_neon.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_neon.py deleted file mode 100644 index 403d9f90..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_neon.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_neon import harvest_neon - -@job -def implnet_job_neon(): - harvest_neon() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_netherland.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_netherland.py deleted file mode 100644 index 766266df..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_netherland.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_netherland import harvest_netherland - -@job -def implnet_job_netherland(): - harvest_netherland() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_nioz.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_nioz.py deleted file mode 100644 index a1ddf3f9..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_nioz.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_nioz import harvest_nioz - -@job -def implnet_job_nioz(): - harvest_nioz() \ No newline at end of file diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_norway.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_norway.py deleted file mode 100644 index 25a39907..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_norway.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_norway import harvest_norway - -@job -def implnet_job_norway(): - harvest_norway() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ntu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ntu.py deleted file mode 100644 index e7770f47..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ntu.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ntu import harvest_ntu - -@job -def implnet_job_ntu(): - harvest_ntu() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ofd.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ofd.py deleted file mode 100644 index 0ffab294..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ofd.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ofd import harvest_ofd - -@job -def implnet_job_ofd(): - harvest_ofd() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_peking.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_peking.py deleted file mode 100644 index 753ee08f..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_peking.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_peking import harvest_peking - -@job -def implnet_job_peking(): - harvest_peking() \ No newline at end of file diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_pesquisa.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_pesquisa.py deleted file mode 100644 index 79aae9e5..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_pesquisa.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_pesquisa import harvest_pesquisa - -@job -def implnet_job_pesquisa(): - harvest_pesquisa() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_pucdp.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_pucdp.py deleted file mode 100644 index 0047c17a..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_pucdp.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_pucdp import harvest_pucdp - -@job -def implnet_job_pucdp(): - harvest_pucdp() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_qdr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_qdr.py deleted file mode 100644 index 464834ff..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_qdr.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_qdr import harvest_qdr - -@job -def implnet_job_qdr(): - harvest_qdr() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rin.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rin.py deleted file mode 100644 index 6332771f..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rin.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_rin import harvest_rin - -@job -def implnet_job_rin(): - harvest_rin() \ No newline at end of file diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rosario.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rosario.py deleted file mode 100644 index 6d3e48b3..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rosario.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_rosario import harvest_rosario - -@job -def implnet_job_rosario(): - harvest_rosario() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rsu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rsu.py deleted file mode 100644 index 3d186f45..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_rsu.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_rsu import harvest_rsu - -@job -def implnet_job_rsu(): - harvest_rsu() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_sceincespo.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_sceincespo.py deleted file mode 100644 index 50936c4d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_sceincespo.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_sceincespo import harvest_sceincespo - -@job -def implnet_job_sceincespo(): - harvest_sceincespo() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_tdi.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_tdi.py deleted file mode 100644 index da347896..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_tdi.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_tdi import harvest_tdi - -@job -def implnet_job_tdi(): - harvest_tdi() \ No newline at end of file diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_tdl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_tdl.py deleted file mode 100644 index 152561b9..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_tdl.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_tdl import harvest_tdl - -@job -def implnet_job_tdl(): - harvest_tdl() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ucdl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ucdl.py deleted file mode 100644 index 644d7e03..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ucdl.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ucdl import harvest_ucdl - -@job -def implnet_job_ucdl(): - harvest_ucdl() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ucla.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ucla.py deleted file mode 100644 index 656bce58..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_ucla.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ucla import harvest_ucla - -@job -def implnet_job_ucla(): - harvest_ucla() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_unb.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_unb.py deleted file mode 100644 index 004c47b0..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_unb.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_unb import harvest_unb - -@job -def implnet_job_unb(): - harvest_unb() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_unc.py 
b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_unc.py deleted file mode 100644 index 2d536d65..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_unc.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_unc import harvest_unc - -@job -def implnet_job_unc(): - harvest_unc() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_uva.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_uva.py deleted file mode 100644 index e4dbcc77..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_uva.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_uva import harvest_uva - -@job -def implnet_job_uva(): - harvest_uva() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_uwi.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_uwi.py deleted file mode 100644 index 2f465b74..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_uwi.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_uwi import harvest_uwi - -@job -def implnet_job_uwi(): - harvest_uwi() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_vtti.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_vtti.py deleted file mode 100644 index 416546d7..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_vtti.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_vtti import harvest_vtti - -@job -def implnet_job_vtti(): - harvest_vtti() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_wardr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_wardr.py 
deleted file mode 100644 index cd0a591c..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_wardr.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_wardr import harvest_wardr - -@job -def implnet_job_wardr(): - harvest_wardr() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_yalenus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_yalenus.py deleted file mode 100644 index 89ae4aea..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/jobs/implnet_jobs_yalenus.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_yalenus import harvest_yalenus - -@job -def implnet_job_yalenus(): - harvest_yalenus() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_abacus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_abacus.py deleted file mode 100644 index 761e2018..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_abacus.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def 
load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = 
["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def abacus_gleaner(context): - returned_value = gleanerio(("gleaner"), "abacus") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def abacus_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "abacus") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def abacus_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "abacus") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def abacus_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "abacus") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def abacus_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "abacus") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_abacus(): - harvest = abacus_gleaner() - load1 = abacus_nabu(harvest) - load2 = abacus_nabuprov(load1) - load3 = abacus_nabuorg(load2) - load4 = abacus_naburelease(load3) diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_acss.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_acss.py deleted file mode 100644 index 93f29b71..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_acss.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # 
secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", 
"/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', 
- 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def acss_gleaner(context): - returned_value = gleanerio(("gleaner"), "acss") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def acss_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "acss") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def acss_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "acss") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def acss_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "acss") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def acss_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "acss") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_acss(): - harvest = acss_gleaner() - load1 = acss_nabu(harvest) - load2 = acss_nabuprov(load1) - load3 = acss_nabuorg(load2) - load4 = acss_naburelease(load3) diff 
--git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_adf.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_adf.py deleted file mode 100644 index 36bf33e9..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_adf.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # 
secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", 
"/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', 
- 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def adf_gleaner(context): - returned_value = gleanerio(("gleaner"), "adf") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def adf_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "adf") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def adf_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "adf") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def adf_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "adf") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def adf_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "adf") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_adf(): - harvest = adf_gleaner() - load1 = adf_nabu(harvest) - load2 = adf_nabuprov(load1) - load3 = adf_nabuorg(load2) - load4 = adf_naburelease(load3) diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_arecibo.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_arecibo.py deleted file mode 100644 index 049e266a..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_arecibo.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # 
secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", 
"/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', 
- 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def arecibo_gleaner(context): - returned_value = gleanerio(("gleaner"), "arecibo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def arecibo_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "arecibo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def arecibo_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "arecibo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def arecibo_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "arecibo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def arecibo_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "arecibo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_arecibo(): - harvest = arecibo_gleaner() - load1 = arecibo_nabu(harvest) - load2 = arecibo_nabuprov(load1) - load3 = 
arecibo_nabuorg(load2) - load4 = arecibo_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_asulrdr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_asulrdr.py deleted file mode 100644 index 4091e128..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_asulrdr.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - 
get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params 
= { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs 
expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def asulrdr_gleaner(context): - returned_value = gleanerio(("gleaner"), "asulrdr") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def asulrdr_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "asulrdr") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def asulrdr_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "asulrdr") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def asulrdr_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "asulrdr") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def asulrdr_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "asulrdr") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_asulrdr(): - harvest = asulrdr_gleaner() - 
load1 = asulrdr_nabu(harvest) - load2 = asulrdr_nabuprov(load1) - load3 = asulrdr_nabuorg(load2) - load4 = asulrdr_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_aussda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_aussda.py deleted file mode 100644 index 4b9d1849..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_aussda.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET 
: {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def aussda_gleaner(context): - returned_value = gleanerio(("gleaner"), "aussda") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def aussda_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "aussda") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def aussda_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "aussda") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def aussda_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "aussda") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def aussda_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "aussda") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_aussda(): - harvest = 
aussda_gleaner() - load1 = aussda_nabu(harvest) - load2 = aussda_nabuprov(load1) - load3 = aussda_nabuorg(load2) - load4 = aussda_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_aws.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_aws.py deleted file mode 100644 index 83b1a3f3..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_aws.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 
BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - 
data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - 
get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def aws_gleaner(context): - returned_value = gleanerio(("gleaner"), "aws") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def aws_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "aws") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def aws_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "aws") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def aws_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "aws") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def aws_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "aws") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def 
harvest_aws(): - harvest = aws_gleaner() - load1 = aws_nabu(harvest) - load2 = aws_nabuprov(load1) - load3 = aws_nabuorg(load2) - load4 = aws_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_borealis.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_borealis.py deleted file mode 100644 index b3e3b566..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_borealis.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - 
get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - 
data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - 
get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def borealis_gleaner(context): - returned_value = gleanerio(("gleaner"), "borealis") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def borealis_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "borealis") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def borealis_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "borealis") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def borealis_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "borealis") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def borealis_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "borealis") - r = str('returned 
value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_borealis(): - harvest = borealis_gleaner() - load1 = borealis_nabu(harvest) - load2 = borealis_nabuprov(load1) - load3 = borealis_nabuorg(load2) - load4 = borealis_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_chile.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_chile.py deleted file mode 100644 index 7a66d0c9..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_chile.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # 
get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = 
["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def chile_gleaner(context): - returned_value = gleanerio(("gleaner"), "chile") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def chile_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "chile") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def chile_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "chile") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def chile_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "chile") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def chile_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "chile") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_chile(): - harvest = chile_gleaner() - load1 = chile_nabu(harvest) - load2 = chile_nabuprov(load1) - load3 = chile_nabuorg(load2) - load4 = chile_naburelease(load3) diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cifor.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cifor.py deleted file mode 100644 index c78bdd93..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cifor.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # 
secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", 
"/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', 
- 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def cifor_gleaner(context): - returned_value = gleanerio(("gleaner"), "cifor") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def cifor_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "cifor") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cifor_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "cifor") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cifor_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "cifor") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cifor_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "cifor") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_cifor(): - harvest = cifor_gleaner() - load1 = cifor_nabu(harvest) - load2 = cifor_nabuprov(load1) - load3 = cifor_nabuorg(load2) - load4 = 
cifor_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cimmyt.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cimmyt.py deleted file mode 100644 index dba0a8a8..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cimmyt.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") 
- - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = 
urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' 
+ cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def cimmyt_gleaner(context): - returned_value = gleanerio(("gleaner"), "cimmyt") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def cimmyt_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "cimmyt") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cimmyt_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "cimmyt") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cimmyt_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "cimmyt") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cimmyt_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "cimmyt") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_cimmyt(): - harvest = cimmyt_gleaner() - load1 = cimmyt_nabu(harvest) - load2 = 
cimmyt_nabuprov(load1) - load3 = cimmyt_nabuorg(load2) - load4 = cimmyt_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cora.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cora.py deleted file mode 100644 index ebb0ae4d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cora.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def cora_gleaner(context): - returned_value = gleanerio(("gleaner"), "cora") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def cora_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "cora") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cora_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "cora") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cora_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "cora") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cora_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "cora") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_cora(): - harvest = cora_gleaner() - load1 = 
cora_nabu(harvest) - load2 = cora_nabuprov(load1) - load3 = cora_nabuorg(load2) - load4 = cora_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_crossda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_crossda.py deleted file mode 100644 index 7520c9f1..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_crossda.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def crossda_gleaner(context): - returned_value = gleanerio(("gleaner"), "crossda") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def crossda_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "crossda") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def crossda_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "crossda") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def crossda_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "crossda") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def crossda_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "crossda") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_crossda(): - 
harvest = crossda_gleaner() - load1 = crossda_nabu(harvest) - load2 = crossda_nabuprov(load1) - load3 = crossda_nabuorg(load2) - load4 = crossda_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cuhk.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cuhk.py deleted file mode 100644 index 43ff574e..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cuhk.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - 
get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - 
data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - 
get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def cuhk_gleaner(context): - returned_value = gleanerio(("gleaner"), "cuhk") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def cuhk_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "cuhk") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cuhk_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "cuhk") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cuhk_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "cuhk") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cuhk_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "cuhk") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def 
harvest_cuhk(): - harvest = cuhk_gleaner() - load1 = cuhk_nabu(harvest) - load2 = cuhk_nabuprov(load1) - load3 = cuhk_nabuorg(load2) - load4 = cuhk_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cyvers.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cyvers.py deleted file mode 100644 index 987064a0..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_cyvers.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - 
get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - 
data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - 
get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def cyvers_gleaner(context): - returned_value = gleanerio(("gleaner"), "cyvers") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def cyvers_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "cyvers") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cyvers_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "cyvers") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cyvers_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "cyvers") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cyvers_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "cyvers") - r = str('returned value:{}'.format(returned_value)) - return msg 
+ r - -@graph -def harvest_cyvers(): - harvest = cyvers_gleaner() - load1 = cyvers_nabu(harvest) - load2 = cyvers_nabuprov(load1) - load3 = cyvers_nabuorg(load2) - load4 = cyvers_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_darus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_darus.py deleted file mode 100644 index 27a84d3a..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_darus.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : 
{str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file 
writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = 
request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def darus_gleaner(context): - returned_value = gleanerio(("gleaner"), "darus") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def darus_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "darus") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def darus_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "darus") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def darus_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "darus") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def darus_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "darus") - r = str('returned 
value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_darus(): - harvest = darus_gleaner() - load1 = darus_nabu(harvest) - load2 = darus_nabuprov(load1) - load3 = darus_nabuorg(load2) - load4 = darus_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_drp.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_drp.py deleted file mode 100644 index a4b1f6f9..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_drp.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read 
started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = 
["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def drp_gleaner(context): - returned_value = gleanerio(("gleaner"), "drp") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def drp_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "drp") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def drp_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "drp") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def drp_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "drp") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def drp_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "drp") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_drp(): - harvest = drp_gleaner() - load1 = drp_nabu(harvest) - load2 = drp_nabuprov(load1) - load3 = drp_nabuorg(load2) - load4 = drp_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_dryad.py 
b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_dryad.py deleted file mode 100644 index 05050404..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_dryad.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = 
bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", 
"prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', 
- 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def dryad_gleaner(context): - returned_value = gleanerio(("gleaner"), "dryad") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def dryad_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "dryad") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def dryad_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "dryad") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def dryad_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "dryad") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def dryad_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "dryad") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_dryad(): - harvest = dryad_gleaner() - load1 = dryad_nabu(harvest) - load2 = dryad_nabuprov(load1) - load3 = dryad_nabuorg(load2) - load4 = 
dryad_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_edatos.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_edatos.py deleted file mode 100644 index 3fab6345..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_edatos.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") 
- - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = 
urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' 
+ cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def edatos_gleaner(context): - returned_value = gleanerio(("gleaner"), "edatos") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def edatos_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "edatos") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def edatos_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "edatos") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def edatos_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "edatos") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def edatos_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "edatos") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_edatos(): - harvest = edatos_gleaner() - load1 = edatos_nabu(harvest) - load2 = 
edatos_nabuprov(load1) - load3 = edatos_nabuorg(load2) - load4 = edatos_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_fiu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_fiu.py deleted file mode 100644 index 446900c1..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_fiu.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - 
get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params 
= { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs 
expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def fiu_gleaner(context): - returned_value = gleanerio(("gleaner"), "fiu") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def fiu_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "fiu") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def fiu_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "fiu") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def fiu_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "fiu") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def fiu_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "fiu") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_fiu(): - harvest = fiu_gleaner() - load1 = fiu_nabu(harvest) - load2 = 
fiu_nabuprov(load1) - load3 = fiu_nabuorg(load2) - load4 = fiu_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_gro.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_gro.py deleted file mode 100644 index 577780a1..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_gro.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - 
get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params 
= { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs 
expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def gro_gleaner(context): - returned_value = gleanerio(("gleaner"), "gro") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def gro_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "gro") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def gro_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "gro") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def gro_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "gro") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def gro_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "gro") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_gro(): - harvest = gro_gleaner() - load1 = gro_nabu(harvest) - load2 = 
gro_nabuprov(load1) - load3 = gro_nabuorg(load2) - load4 = gro_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_harvard.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_harvard.py deleted file mode 100644 index fc6631df..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_harvard.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def harvard_gleaner(context): - returned_value = gleanerio(("gleaner"), "harvard") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def harvard_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "harvard") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def harvard_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "harvard") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def harvard_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "harvard") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def harvard_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "harvard") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_harvard(): - 
harvest = harvard_gleaner() - load1 = harvard_nabu(harvest) - load2 = harvard_nabuprov(load1) - load3 = harvard_nabuorg(load2) - load4 = harvard_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_hopkins.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_hopkins.py deleted file mode 100644 index 0c0f5ab0..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_hopkins.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - 
get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - 
data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - 
get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def hopkins_gleaner(context): - returned_value = gleanerio(("gleaner"), "hopkins") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def hopkins_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "hopkins") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def hopkins_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "hopkins") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def hopkins_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "hopkins") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def hopkins_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "hopkins") - r = str('returned value:{}'.format(returned_value)) - 
return msg + r - -@graph -def harvest_hopkins(): - harvest = hopkins_gleaner() - load1 = hopkins_nabu(harvest) - load2 = hopkins_nabuprov(load1) - load3 = hopkins_nabuorg(load2) - load4 = hopkins_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_hord.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_hord.py deleted file mode 100644 index f10fa238..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_hord.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : 
{str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file 
writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = 
request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def hord_gleaner(context): - returned_value = gleanerio(("gleaner"), "hord") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def hord_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "hord") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def hord_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "hord") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def hord_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "hord") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def hord_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "hord") - r = str('returned 
value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_hord(): - harvest = hord_gleaner() - load1 = hord_nabu(harvest) - load2 = hord_nabuprov(load1) - load3 = hord_nabuorg(load2) - load4 = hord_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ibict.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ibict.py deleted file mode 100644 index 4eaceeb6..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ibict.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read 
started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = 
["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def ibict_gleaner(context): - returned_value = gleanerio(("gleaner"), "ibict") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def ibict_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "ibict") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ibict_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "ibict") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ibict_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "ibict") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ibict_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "ibict") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_ibict(): - harvest = ibict_gleaner() - load1 = ibict_nabu(harvest) - load2 = ibict_nabuprov(load1) - load3 = ibict_nabuorg(load2) - load4 = ibict_naburelease(load3) diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_icarda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_icarda.py deleted file mode 100644 index 1b0f8e74..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_icarda.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # 
secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", 
"/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', 
- 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def icarda_gleaner(context): - returned_value = gleanerio(("gleaner"), "icarda") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def icarda_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "icarda") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def icarda_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "icarda") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def icarda_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "icarda") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def icarda_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "icarda") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_icarda(): - harvest = icarda_gleaner() - load1 = icarda_nabu(harvest) - load2 = icarda_nabuprov(load1) - load3 = icarda_nabuorg(load2) - load4 = 
icarda_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_icrisat.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_icrisat.py deleted file mode 100644 index dd5832dc..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_icrisat.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : 
{str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = 
urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' 
+ cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def icrisat_gleaner(context): - returned_value = gleanerio(("gleaner"), "icrisat") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def icrisat_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "icrisat") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def icrisat_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "icrisat") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def icrisat_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "icrisat") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def icrisat_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "icrisat") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_icrisat(): - harvest = icrisat_gleaner() - load1 = icrisat_nabu(harvest) - load2 = 
icrisat_nabuprov(load1) - load3 = icrisat_nabuorg(load2) - load4 = icrisat_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ifdc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ifdc.py deleted file mode 100644 index 9bb6fd95..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ifdc.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def ifdc_gleaner(context): - returned_value = gleanerio(("gleaner"), "ifdc") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def ifdc_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "ifdc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ifdc_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "ifdc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ifdc_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "ifdc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ifdc_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "ifdc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_ifdc(): - harvest = ifdc_gleaner() - load1 = 
ifdc_nabu(harvest) - load2 = ifdc_nabuprov(load1) - load3 = ifdc_nabuorg(load2) - load4 = ifdc_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ifsttar.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ifsttar.py deleted file mode 100644 index 7b75704d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ifsttar.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def ifsttar_gleaner(context): - returned_value = gleanerio(("gleaner"), "ifsttar") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def ifsttar_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "ifsttar") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ifsttar_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "ifsttar") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ifsttar_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "ifsttar") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ifsttar_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "ifsttar") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_ifsttar(): - 
harvest = ifsttar_gleaner() - load1 = ifsttar_nabu(harvest) - load2 = ifsttar_nabuprov(load1) - load3 = ifsttar_nabuorg(load2) - load4 = ifsttar_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_iisg.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_iisg.py deleted file mode 100644 index e128fbfb..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_iisg.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - 
get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - 
data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - 
get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def iisg_gleaner(context): - returned_value = gleanerio(("gleaner"), "iisg") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def iisg_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "iisg") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def iisg_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "iisg") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def iisg_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "iisg") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def iisg_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "iisg") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def 
harvest_iisg(): - harvest = iisg_gleaner() - load1 = iisg_nabu(harvest) - load2 = iisg_nabuprov(load1) - load3 = iisg_nabuorg(load2) - load4 = iisg_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_iit.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_iit.py deleted file mode 100644 index 9939a812..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_iit.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - 
get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - 
data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - 
get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def iit_gleaner(context): - returned_value = gleanerio(("gleaner"), "iit") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def iit_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "iit") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def iit_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "iit") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def iit_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "iit") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def iit_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "iit") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def 
harvest_iit(): - harvest = iit_gleaner() - load1 = iit_nabu(harvest) - load2 = iit_nabuprov(load1) - load3 = iit_nabuorg(load2) - load4 = iit_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ipc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ipc.py deleted file mode 100644 index c7558d0c..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ipc.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - 
get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - 
data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - 
get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def ipc_gleaner(context): - returned_value = gleanerio(("gleaner"), "ipc") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def ipc_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "ipc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ipc_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "ipc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ipc_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "ipc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ipc_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "ipc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def 
harvest_ipc(): - harvest = ipc_gleaner() - load1 = ipc_nabu(harvest) - load2 = ipc_nabuprov(load1) - load3 = ipc_nabuorg(load2) - load4 = ipc_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_irl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_irl.py deleted file mode 100644 index 53b10ae5..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_irl.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - 
get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - 
data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - 
get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def irl_gleaner(context): - returned_value = gleanerio(("gleaner"), "irl") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def irl_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "irl") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def irl_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "irl") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def irl_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "irl") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def irl_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "irl") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def 
harvest_irl(): - harvest = irl_gleaner() - load1 = irl_nabu(harvest) - load2 = irl_nabuprov(load1) - load3 = irl_nabuorg(load2) - load4 = irl_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_irs.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_irs.py deleted file mode 100644 index 8449f403..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_irs.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - 
get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - 
data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - 
get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def irs_gleaner(context): - returned_value = gleanerio(("gleaner"), "irs") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def irs_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "irs") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def irs_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "irs") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def irs_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "irs") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def irs_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "irs") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def 
harvest_irs(): - harvest = irs_gleaner() - load1 = irs_nabu(harvest) - load2 = irs_nabuprov(load1) - load3 = irs_nabuorg(load2) - load4 = irs_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_julich.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_julich.py deleted file mode 100644 index a7d3cddc..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_julich.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - 
get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - 
data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - 
get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def julich_gleaner(context): - returned_value = gleanerio(("gleaner"), "julich") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def julich_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "julich") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def julich_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "julich") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def julich_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "julich") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def julich_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "julich") - r = str('returned value:{}'.format(returned_value)) - return msg 
+ r - -@graph -def harvest_julich(): - harvest = julich_gleaner() - load1 = julich_nabu(harvest) - load2 = julich_nabuprov(load1) - load3 = julich_nabuorg(load2) - load4 = julich_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_lida.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_lida.py deleted file mode 100644 index d37854c8..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_lida.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : 
{str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file 
writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = 
request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def lida_gleaner(context): - returned_value = gleanerio(("gleaner"), "lida") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def lida_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "lida") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def lida_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "lida") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def lida_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "lida") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def lida_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "lida") - r = str('returned 
value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_lida(): - harvest = lida_gleaner() - load1 = lida_nabu(harvest) - load2 = lida_nabuprov(load1) - load3 = lida_nabuorg(load2) - load4 = lida_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_manitoba.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_manitoba.py deleted file mode 100644 index ec4077d7..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_manitoba.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # 
get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = 
["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def manitoba_gleaner(context): - returned_value = gleanerio(("gleaner"), "manitoba") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def manitoba_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "manitoba") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def manitoba_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "manitoba") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def manitoba_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "manitoba") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def manitoba_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "manitoba") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_manitoba(): - harvest = manitoba_gleaner() - load1 = manitoba_nabu(harvest) - load2 = manitoba_nabuprov(load1) - load3 = manitoba_nabuorg(load2) - load4 = manitoba_naburelease(load3) diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_matcommons.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_matcommons.py deleted file mode 100644 index 1f8516de..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_matcommons.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - 
server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = 
["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', 
- 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def matcommons_gleaner(context): - returned_value = gleanerio(("gleaner"), "matcommons") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def matcommons_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "matcommons") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def matcommons_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "matcommons") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def matcommons_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "matcommons") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def matcommons_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "matcommons") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_matcommons(): - harvest = matcommons_gleaner() - load1 = matcommons_nabu(harvest) - load2 = 
matcommons_nabuprov(load1) - load3 = matcommons_nabuorg(load2) - load4 = matcommons_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_mdf.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_mdf.py deleted file mode 100644 index 30660773..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_mdf.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def mdf_gleaner(context): - returned_value = gleanerio(("gleaner"), "mdf") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def mdf_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "mdf") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def mdf_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "mdf") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def mdf_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "mdf") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def mdf_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "mdf") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_mdf(): - harvest = mdf_gleaner() - load1 = 
mdf_nabu(harvest) - load2 = mdf_nabuprov(load1) - load3 = mdf_nabuorg(load2) - load4 = mdf_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_milano.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_milano.py deleted file mode 100644 index 4460a79a..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_milano.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def milano_gleaner(context): - returned_value = gleanerio(("gleaner"), "milano") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def milano_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "milano") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def milano_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "milano") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def milano_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "milano") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def milano_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "milano") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_milano(): - harvest = 
milano_gleaner() - load1 = milano_nabu(harvest) - load2 = milano_nabuprov(load1) - load3 = milano_nabuorg(load2) - load4 = milano_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_neon.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_neon.py deleted file mode 100644 index 8d300b6a..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_neon.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - 
get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - 
data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - 
get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def neon_gleaner(context): - returned_value = gleanerio(("gleaner"), "neon") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def neon_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "neon") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def neon_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "neon") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def neon_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "neon") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def neon_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "neon") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def 
harvest_neon(): - harvest = neon_gleaner() - load1 = neon_nabu(harvest) - load2 = neon_nabuprov(load1) - load3 = neon_nabuorg(load2) - load4 = neon_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_netherland.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_netherland.py deleted file mode 100644 index 7e73cb88..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_netherland.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : 
{str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file 
writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = 
request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def netherland_gleaner(context): - returned_value = gleanerio(("gleaner"), "netherland") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def netherland_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "netherland") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def netherland_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "netherland") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def netherland_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "netherland") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def netherland_naburelease(context, msg: str): - returned_value = gleanerio(("release"), 
"netherland") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_netherland(): - harvest = netherland_gleaner() - load1 = netherland_nabu(harvest) - load2 = netherland_nabuprov(load1) - load3 = netherland_nabuorg(load2) - load4 = netherland_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_nioz.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_nioz.py deleted file mode 100644 index 8a63dac1..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_nioz.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : 
{str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = 
["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def nioz_gleaner(context): - returned_value = gleanerio(("gleaner"), "nioz") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def nioz_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "nioz") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def nioz_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "nioz") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def nioz_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "nioz") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def nioz_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "nioz") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_nioz(): - harvest = nioz_gleaner() - load1 = nioz_nabu(harvest) - load2 = nioz_nabuprov(load1) - load3 = nioz_nabuorg(load2) - load4 = nioz_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_norway.py 
b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_norway.py deleted file mode 100644 index 777318c6..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_norway.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = 
bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", 
"prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', 
- 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def norway_gleaner(context): - returned_value = gleanerio(("gleaner"), "norway") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def norway_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "norway") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def norway_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "norway") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def norway_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "norway") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def norway_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "norway") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_norway(): - harvest = norway_gleaner() - load1 = norway_nabu(harvest) - load2 = norway_nabuprov(load1) - load3 = norway_nabuorg(load2) - load4 = 
norway_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ntu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ntu.py deleted file mode 100644 index e5fbe902..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ntu.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - 
client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = 
urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' 
+ cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def ntu_gleaner(context): - returned_value = gleanerio(("gleaner"), "ntu") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def ntu_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "ntu") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ntu_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "ntu") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ntu_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "ntu") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ntu_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "ntu") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_ntu(): - harvest = ntu_gleaner() - load1 = ntu_nabu(harvest) - load2 = ntu_nabuprov(load1) - load3 = ntu_nabuorg(load2) - 
load4 = ntu_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ofd.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ofd.py deleted file mode 100644 index 6ad1294a..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ofd.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - 
client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = 
urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' 
+ cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def ofd_gleaner(context): - returned_value = gleanerio(("gleaner"), "ofd") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def ofd_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "ofd") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ofd_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "ofd") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ofd_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "ofd") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ofd_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "ofd") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_ofd(): - harvest = ofd_gleaner() - load1 = ofd_nabu(harvest) - load2 = ofd_nabuprov(load1) - load3 = ofd_nabuorg(load2) - 
load4 = ofd_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_peking.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_peking.py deleted file mode 100644 index e32c7fd8..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_peking.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : 
{str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = 
urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' 
+ cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def peking_gleaner(context): - returned_value = gleanerio(("gleaner"), "peking") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def peking_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "peking") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def peking_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "peking") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def peking_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "peking") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def peking_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "peking") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_peking(): - harvest = peking_gleaner() - load1 = peking_nabu(harvest) - load2 = 
peking_nabuprov(load1) - load3 = peking_nabuorg(load2) - load4 = peking_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_pesquisa.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_pesquisa.py deleted file mode 100644 index be22fa7e..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_pesquisa.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def pesquisa_gleaner(context): - returned_value = gleanerio(("gleaner"), "pesquisa") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def pesquisa_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "pesquisa") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def pesquisa_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "pesquisa") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def pesquisa_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "pesquisa") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def pesquisa_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "pesquisa") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def 
harvest_pesquisa(): - harvest = pesquisa_gleaner() - load1 = pesquisa_nabu(harvest) - load2 = pesquisa_nabuprov(load1) - load3 = pesquisa_nabuorg(load2) - load4 = pesquisa_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_pucdp.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_pucdp.py deleted file mode 100644 index 4ccc7107..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_pucdp.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : 
{str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file 
writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - 
enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = 
request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def pucdp_gleaner(context): - returned_value = gleanerio(("gleaner"), "pucdp") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def pucdp_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "pucdp") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def pucdp_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "pucdp") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def pucdp_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "pucdp") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def pucdp_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "pucdp") - r = str('returned 
value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_pucdp(): - harvest = pucdp_gleaner() - load1 = pucdp_nabu(harvest) - load2 = pucdp_nabuprov(load1) - load3 = pucdp_nabuorg(load2) - load4 = pucdp_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_qdr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_qdr.py deleted file mode 100644 index 09aaa3cb..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_qdr.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read 
started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = 
["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def qdr_gleaner(context): - returned_value = gleanerio(("gleaner"), "qdr") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def qdr_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "qdr") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def qdr_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "qdr") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def qdr_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "qdr") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def qdr_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "qdr") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_qdr(): - harvest = qdr_gleaner() - load1 = qdr_nabu(harvest) - load2 = qdr_nabuprov(load1) - load3 = qdr_nabuorg(load2) - load4 = qdr_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rin.py 
b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rin.py deleted file mode 100644 index cf50affb..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rin.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = 
bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", 
"prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', 
- 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def rin_gleaner(context): - returned_value = gleanerio(("gleaner"), "rin") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def rin_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "rin") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def rin_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "rin") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def rin_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "rin") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def rin_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "rin") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_rin(): - harvest = rin_gleaner() - load1 = rin_nabu(harvest) - load2 = rin_nabuprov(load1) - load3 = rin_nabuorg(load2) - load4 = rin_naburelease(load3) diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rosario.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rosario.py deleted file mode 100644 index 8a9f398e..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rosario.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # 
secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", 
"/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', 
- 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def rosario_gleaner(context): - returned_value = gleanerio(("gleaner"), "rosario") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def rosario_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "rosario") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def rosario_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "rosario") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def rosario_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "rosario") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def rosario_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "rosario") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_rosario(): - harvest = rosario_gleaner() - load1 = rosario_nabu(harvest) - load2 = rosario_nabuprov(load1) - load3 = 
rosario_nabuorg(load2) - load4 = rosario_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rsu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rsu.py deleted file mode 100644 index b3f9693d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_rsu.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 
object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = 
urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' 
+ cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def rsu_gleaner(context): - returned_value = gleanerio(("gleaner"), "rsu") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def rsu_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "rsu") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def rsu_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "rsu") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def rsu_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "rsu") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def rsu_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "rsu") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_rsu(): - harvest = rsu_gleaner() - load1 = rsu_nabu(harvest) - load2 = rsu_nabuprov(load1) - load3 = rsu_nabuorg(load2) - 
load4 = rsu_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_sceincespo.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_sceincespo.py deleted file mode 100644 index 1c15a43e..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_sceincespo.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object 
: {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = 
urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' 
+ cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def sceincespo_gleaner(context): - returned_value = gleanerio(("gleaner"), "sceincespo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def sceincespo_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "sceincespo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def sceincespo_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "sceincespo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def sceincespo_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "sceincespo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def sceincespo_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "sceincespo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_sceincespo(): - harvest = sceincespo_gleaner() - load1 = 
sceincespo_nabu(harvest) - load2 = sceincespo_nabuprov(load1) - load3 = sceincespo_nabuorg(load2) - load4 = sceincespo_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_tdi.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_tdi.py deleted file mode 100644 index d6bd59ed..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_tdi.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def tdi_gleaner(context): - returned_value = gleanerio(("gleaner"), "tdi") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def tdi_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "tdi") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def tdi_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "tdi") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def tdi_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "tdi") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def tdi_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "tdi") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_tdi(): - harvest = tdi_gleaner() - load1 = 
tdi_nabu(harvest) - load2 = tdi_nabuprov(load1) - load3 = tdi_nabuorg(load2) - load4 = tdi_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_tdl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_tdl.py deleted file mode 100644 index 344d5c8e..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_tdl.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def tdl_gleaner(context): - returned_value = gleanerio(("gleaner"), "tdl") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def tdl_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "tdl") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def tdl_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "tdl") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def tdl_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "tdl") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def tdl_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "tdl") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_tdl(): - harvest = tdl_gleaner() - load1 = 
tdl_nabu(harvest) - load2 = tdl_nabuprov(load1) - load3 = tdl_nabuorg(load2) - load4 = tdl_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ucdl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ucdl.py deleted file mode 100644 index 69b8ffea..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ucdl.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def ucdl_gleaner(context): - returned_value = gleanerio(("gleaner"), "ucdl") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def ucdl_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "ucdl") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ucdl_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "ucdl") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ucdl_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "ucdl") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ucdl_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "ucdl") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_ucdl(): - harvest = ucdl_gleaner() - load1 = 
ucdl_nabu(harvest) - load2 = ucdl_nabuprov(load1) - load3 = ucdl_nabuorg(load2) - load4 = ucdl_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ucla.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ucla.py deleted file mode 100644 index c9311d11..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_ucla.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def ucla_gleaner(context): - returned_value = gleanerio(("gleaner"), "ucla") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def ucla_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "ucla") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ucla_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "ucla") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ucla_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "ucla") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ucla_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "ucla") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_ucla(): - harvest = ucla_gleaner() - load1 = 
ucla_nabu(harvest) - load2 = ucla_nabuprov(load1) - load3 = ucla_nabuorg(load2) - load4 = ucla_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_unb.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_unb.py deleted file mode 100644 index be9c2bfb..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_unb.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def unb_gleaner(context): - returned_value = gleanerio(("gleaner"), "unb") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def unb_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "unb") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def unb_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "unb") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def unb_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "unb") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def unb_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "unb") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_unb(): - harvest = unb_gleaner() - load1 = 
unb_nabu(harvest) - load2 = unb_nabuprov(load1) - load3 = unb_nabuorg(load2) - load4 = unb_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_unc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_unc.py deleted file mode 100644 index e28ebfff..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_unc.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def unc_gleaner(context): - returned_value = gleanerio(("gleaner"), "unc") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def unc_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "unc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def unc_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "unc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def unc_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "unc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def unc_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "unc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_unc(): - harvest = unc_gleaner() - load1 = 
unc_nabu(harvest) - load2 = unc_nabuprov(load1) - load3 = unc_nabuorg(load2) - load4 = unc_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_uva.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_uva.py deleted file mode 100644 index de81e633..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_uva.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def uva_gleaner(context): - returned_value = gleanerio(("gleaner"), "uva") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def uva_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "uva") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def uva_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "uva") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def uva_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "uva") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def uva_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "uva") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_uva(): - harvest = uva_gleaner() - load1 = 
uva_nabu(harvest) - load2 = uva_nabuprov(load1) - load3 = uva_nabuorg(load2) - load4 = uva_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_uwi.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_uwi.py deleted file mode 100644 index 19917adf..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_uwi.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def uwi_gleaner(context): - returned_value = gleanerio(("gleaner"), "uwi") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def uwi_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "uwi") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def uwi_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "uwi") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def uwi_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "uwi") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def uwi_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "uwi") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_uwi(): - harvest = uwi_gleaner() - load1 = 
uwi_nabu(harvest) - load2 = uwi_nabuprov(load1) - load3 = uwi_nabuorg(load2) - load4 = uwi_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_vtti.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_vtti.py deleted file mode 100644 index 2e73d3f2..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_vtti.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def vtti_gleaner(context): - returned_value = gleanerio(("gleaner"), "vtti") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def vtti_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "vtti") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def vtti_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "vtti") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def vtti_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "vtti") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def vtti_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "vtti") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_vtti(): - harvest = vtti_gleaner() - load1 = 
vtti_nabu(harvest) - load2 = vtti_nabuprov(load1) - load3 = vtti_nabuorg(load2) - load4 = vtti_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_wardr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_wardr.py deleted file mode 100644 index ccf5be3b..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_wardr.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def wardr_gleaner(context): - returned_value = gleanerio(("gleaner"), "wardr") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def wardr_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "wardr") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def wardr_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "wardr") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def wardr_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "wardr") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def wardr_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "wardr") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_wardr(): - harvest = wardr_gleaner() 
- load1 = wardr_nabu(harvest) - load2 = wardr_nabuprov(load1) - load3 = wardr_nabuorg(load2) - load4 = wardr_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_yalenus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_yalenus.py deleted file mode 100644 index 301b6b11..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/ops/implnet_ops_yalenus.py +++ /dev/null @@ -1,304 +0,0 @@ -import distutils - -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_URL'))}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : 
{str(os.environ.get('GLEANER_MINIO_BUCKET'))}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 
'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - 
- ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def yalenus_gleaner(context): - returned_value = gleanerio(("gleaner"), "yalenus") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def yalenus_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "yalenus") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def yalenus_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "yalenus") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def yalenus_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "yalenus") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def yalenus_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "yalenus") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_yalenus(): - 
harvest = yalenus_gleaner() - load1 = yalenus_nabu(harvest) - load2 = yalenus_nabuprov(load1) - load3 = yalenus_nabuorg(load2) - load4 = yalenus_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/repositories/repository.py b/dagster/implnets/generatedCode/implnet-nsdf/output/repositories/repository.py deleted file mode 100644 index 8238ee49..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/repositories/repository.py +++ /dev/null @@ -1,139 +0,0 @@ -from dagster import repository -from jobs.implnet_jobs_arecibo import implnet_job_arecibo -from sch.implnet_sch_arecibo import implnet_sch_arecibo -from jobs.implnet_jobs_aws import implnet_job_aws -from sch.implnet_sch_aws import implnet_sch_aws -from jobs.implnet_jobs_cyvers import implnet_job_cyvers -from sch.implnet_sch_cyvers import implnet_sch_cyvers -from jobs.implnet_jobs_drp import implnet_job_drp -from sch.implnet_sch_drp import implnet_sch_drp -from jobs.implnet_jobs_dryad import implnet_job_dryad -from sch.implnet_sch_dryad import implnet_sch_dryad -from jobs.implnet_jobs_matcommons import implnet_job_matcommons -from sch.implnet_sch_matcommons import implnet_sch_matcommons -from jobs.implnet_jobs_mdf import implnet_job_mdf -from sch.implnet_sch_mdf import implnet_sch_mdf -from jobs.implnet_jobs_neon import implnet_job_neon -from sch.implnet_sch_neon import implnet_sch_neon -from jobs.implnet_jobs_abacus import implnet_job_abacus -from sch.implnet_sch_abacus import implnet_sch_abacus -from jobs.implnet_jobs_acss import implnet_job_acss -from sch.implnet_sch_acss import implnet_sch_acss -from jobs.implnet_jobs_adf import implnet_job_adf -from sch.implnet_sch_adf import implnet_sch_adf -from jobs.implnet_jobs_asulrdr import implnet_job_asulrdr -from sch.implnet_sch_asulrdr import implnet_sch_asulrdr -from jobs.implnet_jobs_aussda import implnet_job_aussda -from sch.implnet_sch_aussda import implnet_sch_aussda -from jobs.implnet_jobs_borealis import 
implnet_job_borealis -from sch.implnet_sch_borealis import implnet_sch_borealis -from jobs.implnet_jobs_cifor import implnet_job_cifor -from sch.implnet_sch_cifor import implnet_sch_cifor -from jobs.implnet_jobs_cimmyt import implnet_job_cimmyt -from sch.implnet_sch_cimmyt import implnet_sch_cimmyt -from jobs.implnet_jobs_cora import implnet_job_cora -from sch.implnet_sch_cora import implnet_sch_cora -from jobs.implnet_jobs_crossda import implnet_job_crossda -from sch.implnet_sch_crossda import implnet_sch_crossda -from jobs.implnet_jobs_cuhk import implnet_job_cuhk -from sch.implnet_sch_cuhk import implnet_sch_cuhk -from jobs.implnet_jobs_tdi import implnet_job_tdi -from sch.implnet_sch_tdi import implnet_sch_tdi -from jobs.implnet_jobs_darus import implnet_job_darus -from sch.implnet_sch_darus import implnet_sch_darus -from jobs.implnet_jobs_irs import implnet_job_irs -from sch.implnet_sch_irs import implnet_sch_irs -from jobs.implnet_jobs_sceincespo import implnet_job_sceincespo -from sch.implnet_sch_sceincespo import implnet_sch_sceincespo -from jobs.implnet_jobs_edatos import implnet_job_edatos -from sch.implnet_sch_edatos import implnet_sch_edatos -from jobs.implnet_jobs_netherland import implnet_job_netherland -from sch.implnet_sch_netherland import implnet_sch_netherland -from jobs.implnet_jobs_norway import implnet_job_norway -from sch.implnet_sch_norway import implnet_sch_norway -from jobs.implnet_jobs_ntu import implnet_job_ntu -from sch.implnet_sch_ntu import implnet_sch_ntu -from jobs.implnet_jobs_fiu import implnet_job_fiu -from sch.implnet_sch_fiu import implnet_sch_fiu -from jobs.implnet_jobs_gro import implnet_job_gro -from sch.implnet_sch_gro import implnet_sch_gro -from jobs.implnet_jobs_harvard import implnet_job_harvard -from sch.implnet_sch_harvard import implnet_sch_harvard -from jobs.implnet_jobs_hord import implnet_job_hord -from sch.implnet_sch_hord import implnet_sch_hord -from jobs.implnet_jobs_ibict import implnet_job_ibict -from 
sch.implnet_sch_ibict import implnet_sch_ibict -from jobs.implnet_jobs_icrisat import implnet_job_icrisat -from sch.implnet_sch_icrisat import implnet_sch_icrisat -from jobs.implnet_jobs_ifdc import implnet_job_ifdc -from sch.implnet_sch_ifdc import implnet_sch_ifdc -from jobs.implnet_jobs_ifsttar import implnet_job_ifsttar -from sch.implnet_sch_ifsttar import implnet_sch_ifsttar -from jobs.implnet_jobs_iisg import implnet_job_iisg -from sch.implnet_sch_iisg import implnet_sch_iisg -from jobs.implnet_jobs_irl import implnet_job_irl -from sch.implnet_sch_irl import implnet_sch_irl -from jobs.implnet_jobs_ipc import implnet_job_ipc -from sch.implnet_sch_ipc import implnet_sch_ipc -from jobs.implnet_jobs_iit import implnet_job_iit -from sch.implnet_sch_iit import implnet_sch_iit -from jobs.implnet_jobs_hopkins import implnet_job_hopkins -from sch.implnet_sch_hopkins import implnet_sch_hopkins -from jobs.implnet_jobs_julich import implnet_job_julich -from sch.implnet_sch_julich import implnet_sch_julich -from jobs.implnet_jobs_uva import implnet_job_uva -from sch.implnet_sch_uva import implnet_sch_uva -from jobs.implnet_jobs_rin import implnet_job_rin -from sch.implnet_sch_rin import implnet_sch_rin -from jobs.implnet_jobs_lida import implnet_job_lida -from sch.implnet_sch_lida import implnet_sch_lida -from jobs.implnet_jobs_icarda import implnet_job_icarda -from sch.implnet_sch_icarda import implnet_sch_icarda -from jobs.implnet_jobs_nioz import implnet_job_nioz -from sch.implnet_sch_nioz import implnet_sch_nioz -from jobs.implnet_jobs_ucdl import implnet_job_ucdl -from sch.implnet_sch_ucdl import implnet_sch_ucdl -from jobs.implnet_jobs_ofd import implnet_job_ofd -from sch.implnet_sch_ofd import implnet_sch_ofd -from jobs.implnet_jobs_peking import implnet_job_peking -from sch.implnet_sch_peking import implnet_sch_peking -from jobs.implnet_jobs_pucdp import implnet_job_pucdp -from sch.implnet_sch_pucdp import implnet_sch_pucdp -from jobs.implnet_jobs_qdr import 
implnet_job_qdr -from sch.implnet_sch_qdr import implnet_sch_qdr -from jobs.implnet_jobs_chile import implnet_job_chile -from sch.implnet_sch_chile import implnet_sch_chile -from jobs.implnet_jobs_rosario import implnet_job_rosario -from sch.implnet_sch_rosario import implnet_sch_rosario -from jobs.implnet_jobs_pesquisa import implnet_job_pesquisa -from sch.implnet_sch_pesquisa import implnet_sch_pesquisa -from jobs.implnet_jobs_rsu import implnet_job_rsu -from sch.implnet_sch_rsu import implnet_sch_rsu -from jobs.implnet_jobs_tdl import implnet_job_tdl -from sch.implnet_sch_tdl import implnet_sch_tdl -from jobs.implnet_jobs_ucla import implnet_job_ucla -from sch.implnet_sch_ucla import implnet_sch_ucla -from jobs.implnet_jobs_unb import implnet_job_unb -from sch.implnet_sch_unb import implnet_sch_unb -from jobs.implnet_jobs_unc import implnet_job_unc -from sch.implnet_sch_unc import implnet_sch_unc -from jobs.implnet_jobs_manitoba import implnet_job_manitoba -from sch.implnet_sch_manitoba import implnet_sch_manitoba -from jobs.implnet_jobs_milano import implnet_job_milano -from sch.implnet_sch_milano import implnet_sch_milano -from jobs.implnet_jobs_uwi import implnet_job_uwi -from sch.implnet_sch_uwi import implnet_sch_uwi -from jobs.implnet_jobs_vtti import implnet_job_vtti -from sch.implnet_sch_vtti import implnet_sch_vtti -from jobs.implnet_jobs_wardr import implnet_job_wardr -from sch.implnet_sch_wardr import implnet_sch_wardr -from jobs.implnet_jobs_yalenus import implnet_job_yalenus -from sch.implnet_sch_yalenus import implnet_sch_yalenus - -@repository -def gleaner(): - jobs = [implnet_job_arecibo, implnet_job_aws, implnet_job_cyvers, implnet_job_drp, implnet_job_dryad, implnet_job_matcommons, implnet_job_mdf, implnet_job_neon, implnet_job_abacus, implnet_job_acss, implnet_job_adf, implnet_job_asulrdr, implnet_job_aussda, implnet_job_borealis, implnet_job_cifor, implnet_job_cimmyt, implnet_job_cora, implnet_job_crossda, implnet_job_cuhk, implnet_job_tdi, 
implnet_job_darus, implnet_job_irs, implnet_job_sceincespo, implnet_job_edatos, implnet_job_netherland, implnet_job_norway, implnet_job_ntu, implnet_job_fiu, implnet_job_gro, implnet_job_harvard, implnet_job_hord, implnet_job_ibict, implnet_job_icrisat, implnet_job_ifdc, implnet_job_ifsttar, implnet_job_iisg, implnet_job_irl, implnet_job_ipc, implnet_job_iit, implnet_job_hopkins, implnet_job_julich, implnet_job_uva, implnet_job_rin, implnet_job_lida, implnet_job_icarda, implnet_job_nioz, implnet_job_ucdl, implnet_job_ofd, implnet_job_peking, implnet_job_pucdp, implnet_job_qdr, implnet_job_chile, implnet_job_rosario, implnet_job_pesquisa, implnet_job_rsu, implnet_job_tdl, implnet_job_ucla, implnet_job_unb, implnet_job_unc, implnet_job_manitoba, implnet_job_milano, implnet_job_uwi, implnet_job_vtti, implnet_job_wardr, implnet_job_yalenus] - schedules = [implnet_sch_arecibo, implnet_sch_aws, implnet_sch_cyvers, implnet_sch_drp, implnet_sch_dryad, implnet_sch_matcommons, implnet_sch_mdf, implnet_sch_neon, implnet_sch_abacus, implnet_sch_acss, implnet_sch_adf, implnet_sch_asulrdr, implnet_sch_aussda, implnet_sch_borealis, implnet_sch_cifor, implnet_sch_cimmyt, implnet_sch_cora, implnet_sch_crossda, implnet_sch_cuhk, implnet_sch_tdi, implnet_sch_darus, implnet_sch_irs, implnet_sch_sceincespo, implnet_sch_edatos, implnet_sch_netherland, implnet_sch_norway, implnet_sch_ntu, implnet_sch_fiu, implnet_sch_gro, implnet_sch_harvard, implnet_sch_hord, implnet_sch_ibict, implnet_sch_icrisat, implnet_sch_ifdc, implnet_sch_ifsttar, implnet_sch_iisg, implnet_sch_irl, implnet_sch_ipc, implnet_sch_iit, implnet_sch_hopkins, implnet_sch_julich, implnet_sch_uva, implnet_sch_rin, implnet_sch_lida, implnet_sch_icarda, implnet_sch_nioz, implnet_sch_ucdl, implnet_sch_ofd, implnet_sch_peking, implnet_sch_pucdp, implnet_sch_qdr, implnet_sch_chile, implnet_sch_rosario, implnet_sch_pesquisa, implnet_sch_rsu, implnet_sch_tdl, implnet_sch_ucla, implnet_sch_unb, implnet_sch_unc, 
implnet_sch_manitoba, implnet_sch_milano, implnet_sch_uwi, implnet_sch_vtti, implnet_sch_wardr, implnet_sch_yalenus] - - - return jobs + schedules diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_abacus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_abacus.py deleted file mode 100644 index 11ab7013..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_abacus.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_abacus import implnet_job_abacus - -@schedule(cron_schedule="0 0 * * 1", job=implnet_job_abacus, execution_timezone="US/Central") -def implnet_sch_abacus(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_acss.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_acss.py deleted file mode 100644 index 91b4bd20..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_acss.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_acss import implnet_job_acss - -@schedule(cron_schedule="0 3 * * 1", job=implnet_job_acss, execution_timezone="US/Central") -def implnet_sch_acss(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_adf.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_adf.py deleted file mode 100644 index aab8c315..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_adf.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_adf import implnet_job_adf - -@schedule(cron_schedule="0 6 * * 1", job=implnet_job_adf, execution_timezone="US/Central") -def implnet_sch_adf(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_arecibo.py 
b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_arecibo.py deleted file mode 100644 index 24b535b8..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_arecibo.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_arecibo import implnet_job_arecibo - -@schedule(cron_schedule="0 0 * * 0", job=implnet_job_arecibo, execution_timezone="US/Central") -def implnet_sch_arecibo(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_asulrdr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_asulrdr.py deleted file mode 100644 index d751d8c1..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_asulrdr.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_asulrdr import implnet_job_asulrdr - -@schedule(cron_schedule="0 9 * * 1", job=implnet_job_asulrdr, execution_timezone="US/Central") -def implnet_sch_asulrdr(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_aussda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_aussda.py deleted file mode 100644 index 5e3a7de5..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_aussda.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_aussda import implnet_job_aussda - -@schedule(cron_schedule="0 12 * * 1", job=implnet_job_aussda, execution_timezone="US/Central") -def implnet_sch_aussda(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_aws.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_aws.py deleted file mode 100644 index e078f291..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_aws.py +++ 
/dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_aws import implnet_job_aws - -@schedule(cron_schedule="0 3 * * 0", job=implnet_job_aws, execution_timezone="US/Central") -def implnet_sch_aws(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_borealis.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_borealis.py deleted file mode 100644 index 6bea8376..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_borealis.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_borealis import implnet_job_borealis - -@schedule(cron_schedule="0 15 * * 1", job=implnet_job_borealis, execution_timezone="US/Central") -def implnet_sch_borealis(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_chile.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_chile.py deleted file mode 100644 index ea503613..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_chile.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_chile import implnet_job_chile - -@schedule(cron_schedule="0 9 * * 6", job=implnet_job_chile, execution_timezone="US/Central") -def implnet_sch_chile(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cifor.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cifor.py deleted file mode 100644 index 0cfb6379..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cifor.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cifor import implnet_job_cifor - -@schedule(cron_schedule="0 18 * * 1", job=implnet_job_cifor, execution_timezone="US/Central") -def 
implnet_sch_cifor(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cimmyt.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cimmyt.py deleted file mode 100644 index 2a1362af..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cimmyt.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cimmyt import implnet_job_cimmyt - -@schedule(cron_schedule="0 21 * * 1", job=implnet_job_cimmyt, execution_timezone="US/Central") -def implnet_sch_cimmyt(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cora.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cora.py deleted file mode 100644 index 74c44435..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cora.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cora import implnet_job_cora - -@schedule(cron_schedule="0 0 * * 2", job=implnet_job_cora, execution_timezone="US/Central") -def implnet_sch_cora(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_crossda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_crossda.py deleted file mode 100644 index 754bb45d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_crossda.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_crossda import implnet_job_crossda - -@schedule(cron_schedule="0 3 * * 2", job=implnet_job_crossda, execution_timezone="US/Central") -def implnet_sch_crossda(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cuhk.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cuhk.py 
deleted file mode 100644 index 8be91a5d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cuhk.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cuhk import implnet_job_cuhk - -@schedule(cron_schedule="0 6 * * 2", job=implnet_job_cuhk, execution_timezone="US/Central") -def implnet_sch_cuhk(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cyvers.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cyvers.py deleted file mode 100644 index fe5dbd1b..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_cyvers.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cyvers import implnet_job_cyvers - -@schedule(cron_schedule="0 6 * * 0", job=implnet_job_cyvers, execution_timezone="US/Central") -def implnet_sch_cyvers(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_darus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_darus.py deleted file mode 100644 index ce481a78..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_darus.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_darus import implnet_job_darus - -@schedule(cron_schedule="0 12 * * 2", job=implnet_job_darus, execution_timezone="US/Central") -def implnet_sch_darus(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_drp.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_drp.py deleted file mode 100644 index 58db9727..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_drp.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_drp import implnet_job_drp - 
-@schedule(cron_schedule="0 9 * * 0", job=implnet_job_drp, execution_timezone="US/Central") -def implnet_sch_drp(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_dryad.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_dryad.py deleted file mode 100644 index d8d38019..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_dryad.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_dryad import implnet_job_dryad - -@schedule(cron_schedule="0 12 * * 0", job=implnet_job_dryad, execution_timezone="US/Central") -def implnet_sch_dryad(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_edatos.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_edatos.py deleted file mode 100644 index ebcf8c9c..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_edatos.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_edatos import implnet_job_edatos - -@schedule(cron_schedule="0 21 * * 2", job=implnet_job_edatos, execution_timezone="US/Central") -def implnet_sch_edatos(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_fiu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_fiu.py deleted file mode 100644 index aabd7ded..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_fiu.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_fiu import implnet_job_fiu - -@schedule(cron_schedule="0 9 * * 3", job=implnet_job_fiu, execution_timezone="US/Central") -def implnet_sch_fiu(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_gro.py 
b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_gro.py deleted file mode 100644 index 1b90b0be..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_gro.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_gro import implnet_job_gro - -@schedule(cron_schedule="0 12 * * 3", job=implnet_job_gro, execution_timezone="US/Central") -def implnet_sch_gro(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_harvard.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_harvard.py deleted file mode 100644 index 56264ff6..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_harvard.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_harvard import implnet_job_harvard - -@schedule(cron_schedule="0 15 * * 3", job=implnet_job_harvard, execution_timezone="US/Central") -def implnet_sch_harvard(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_hopkins.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_hopkins.py deleted file mode 100644 index 556a21f9..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_hopkins.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_hopkins import implnet_job_hopkins - -@schedule(cron_schedule="0 21 * * 4", job=implnet_job_hopkins, execution_timezone="US/Central") -def implnet_sch_hopkins(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_hord.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_hord.py deleted file mode 100644 index 745e7e5d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_hord.py +++ /dev/null @@ -1,8 
+0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_hord import implnet_job_hord - -@schedule(cron_schedule="0 18 * * 3", job=implnet_job_hord, execution_timezone="US/Central") -def implnet_sch_hord(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ibict.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ibict.py deleted file mode 100644 index dd122902..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ibict.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ibict import implnet_job_ibict - -@schedule(cron_schedule="0 21 * * 3", job=implnet_job_ibict, execution_timezone="US/Central") -def implnet_sch_ibict(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_icarda.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_icarda.py deleted file mode 100644 index 6a5b77dd..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_icarda.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_icarda import implnet_job_icarda - -@schedule(cron_schedule="0 12 * * 5", job=implnet_job_icarda, execution_timezone="US/Central") -def implnet_sch_icarda(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_icrisat.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_icrisat.py deleted file mode 100644 index 16aae845..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_icrisat.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_icrisat import implnet_job_icrisat - -@schedule(cron_schedule="0 0 * * 4", job=implnet_job_icrisat, execution_timezone="US/Central") -def implnet_sch_icrisat(_context): - 
run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ifdc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ifdc.py deleted file mode 100644 index 8ad696d4..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ifdc.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ifdc import implnet_job_ifdc - -@schedule(cron_schedule="0 3 * * 4", job=implnet_job_ifdc, execution_timezone="US/Central") -def implnet_sch_ifdc(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ifsttar.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ifsttar.py deleted file mode 100644 index 6ec28f67..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ifsttar.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ifsttar import implnet_job_ifsttar - -@schedule(cron_schedule="0 6 * * 4", job=implnet_job_ifsttar, execution_timezone="US/Central") -def implnet_sch_ifsttar(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_iisg.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_iisg.py deleted file mode 100644 index 3c896328..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_iisg.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_iisg import implnet_job_iisg - -@schedule(cron_schedule="0 9 * * 4", job=implnet_job_iisg, execution_timezone="US/Central") -def implnet_sch_iisg(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_iit.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_iit.py deleted file mode 100644 index 
ebea679c..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_iit.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_iit import implnet_job_iit - -@schedule(cron_schedule="0 18 * * 4", job=implnet_job_iit, execution_timezone="US/Central") -def implnet_sch_iit(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ipc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ipc.py deleted file mode 100644 index 7411b495..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ipc.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ipc import implnet_job_ipc - -@schedule(cron_schedule="0 15 * * 4", job=implnet_job_ipc, execution_timezone="US/Central") -def implnet_sch_ipc(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_irl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_irl.py deleted file mode 100644 index b3cab340..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_irl.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_irl import implnet_job_irl - -@schedule(cron_schedule="0 12 * * 4", job=implnet_job_irl, execution_timezone="US/Central") -def implnet_sch_irl(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_irs.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_irs.py deleted file mode 100644 index e3038c44..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_irs.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_irs import implnet_job_irs - -@schedule(cron_schedule="0 15 * * 2", job=implnet_job_irs, 
execution_timezone="US/Central") -def implnet_sch_irs(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_julich.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_julich.py deleted file mode 100644 index e9d14937..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_julich.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_julich import implnet_job_julich - -@schedule(cron_schedule="0 0 * * 5", job=implnet_job_julich, execution_timezone="US/Central") -def implnet_sch_julich(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_lida.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_lida.py deleted file mode 100644 index 1f9fcf74..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_lida.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_lida import implnet_job_lida - -@schedule(cron_schedule="0 9 * * 5", job=implnet_job_lida, execution_timezone="US/Central") -def implnet_sch_lida(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_manitoba.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_manitoba.py deleted file mode 100644 index e736c4af..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_manitoba.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_manitoba import implnet_job_manitoba - -@schedule(cron_schedule="0 9 * * 0", job=implnet_job_manitoba, execution_timezone="US/Central") -def implnet_sch_manitoba(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_matcommons.py 
b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_matcommons.py deleted file mode 100644 index 7c85eaeb..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_matcommons.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_matcommons import implnet_job_matcommons - -@schedule(cron_schedule="0 15 * * 0", job=implnet_job_matcommons, execution_timezone="US/Central") -def implnet_sch_matcommons(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_mdf.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_mdf.py deleted file mode 100644 index 25197950..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_mdf.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_mdf import implnet_job_mdf - -@schedule(cron_schedule="0 18 * * 0", job=implnet_job_mdf, execution_timezone="US/Central") -def implnet_sch_mdf(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_milano.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_milano.py deleted file mode 100644 index 06072f0b..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_milano.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_milano import implnet_job_milano - -@schedule(cron_schedule="0 12 * * 0", job=implnet_job_milano, execution_timezone="US/Central") -def implnet_sch_milano(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_neon.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_neon.py deleted file mode 100644 index 5a26d9cc..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_neon.py +++ /dev/null 
@@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_neon import implnet_job_neon - -@schedule(cron_schedule="0 21 * * 0", job=implnet_job_neon, execution_timezone="US/Central") -def implnet_sch_neon(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_netherland.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_netherland.py deleted file mode 100644 index 1711cde9..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_netherland.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_netherland import implnet_job_netherland - -@schedule(cron_schedule="0 0 * * 3", job=implnet_job_netherland, execution_timezone="US/Central") -def implnet_sch_netherland(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_nioz.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_nioz.py deleted file mode 100644 index de7e3899..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_nioz.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_nioz import implnet_job_nioz - -@schedule(cron_schedule="0 15 * * 5", job=implnet_job_nioz, execution_timezone="US/Central") -def implnet_sch_nioz(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_norway.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_norway.py deleted file mode 100644 index 29fe289b..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_norway.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_norway import implnet_job_norway - -@schedule(cron_schedule="0 3 * * 3", job=implnet_job_norway, execution_timezone="US/Central") -def 
implnet_sch_norway(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ntu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ntu.py deleted file mode 100644 index 2aa2a95c..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ntu.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ntu import implnet_job_ntu - -@schedule(cron_schedule="0 6 * * 3", job=implnet_job_ntu, execution_timezone="US/Central") -def implnet_sch_ntu(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ofd.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ofd.py deleted file mode 100644 index d803ed4d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ofd.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ofd import implnet_job_ofd - -@schedule(cron_schedule="0 21 * * 5", job=implnet_job_ofd, execution_timezone="US/Central") -def implnet_sch_ofd(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_peking.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_peking.py deleted file mode 100644 index 6306642b..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_peking.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_peking import implnet_job_peking - -@schedule(cron_schedule="0 0 * * 6", job=implnet_job_peking, execution_timezone="US/Central") -def implnet_sch_peking(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_pesquisa.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_pesquisa.py deleted file mode 100644 
index e4f2d6df..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_pesquisa.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_pesquisa import implnet_job_pesquisa - -@schedule(cron_schedule="0 15 * * 6", job=implnet_job_pesquisa, execution_timezone="US/Central") -def implnet_sch_pesquisa(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_pucdp.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_pucdp.py deleted file mode 100644 index 2718840b..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_pucdp.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_pucdp import implnet_job_pucdp - -@schedule(cron_schedule="0 3 * * 6", job=implnet_job_pucdp, execution_timezone="US/Central") -def implnet_sch_pucdp(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_qdr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_qdr.py deleted file mode 100644 index 8969f6ed..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_qdr.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_qdr import implnet_job_qdr - -@schedule(cron_schedule="0 6 * * 6", job=implnet_job_qdr, execution_timezone="US/Central") -def implnet_sch_qdr(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rin.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rin.py deleted file mode 100644 index 25383578..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rin.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_rin import implnet_job_rin - -@schedule(cron_schedule="0 6 
* * 5", job=implnet_job_rin, execution_timezone="US/Central") -def implnet_sch_rin(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rosario.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rosario.py deleted file mode 100644 index 9fa8e0d0..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rosario.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_rosario import implnet_job_rosario - -@schedule(cron_schedule="0 12 * * 6", job=implnet_job_rosario, execution_timezone="US/Central") -def implnet_sch_rosario(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rsu.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rsu.py deleted file mode 100644 index 63021fe0..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_rsu.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_rsu import implnet_job_rsu - -@schedule(cron_schedule="0 18 * * 6", job=implnet_job_rsu, execution_timezone="US/Central") -def implnet_sch_rsu(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_sceincespo.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_sceincespo.py deleted file mode 100644 index 69537d9c..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_sceincespo.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_sceincespo import implnet_job_sceincespo - -@schedule(cron_schedule="0 18 * * 2", job=implnet_job_sceincespo, execution_timezone="US/Central") -def implnet_sch_sceincespo(_context): - run_config = {} - return run_config diff --git 
a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_tdi.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_tdi.py deleted file mode 100644 index e836b9c6..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_tdi.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_tdi import implnet_job_tdi - -@schedule(cron_schedule="0 9 * * 2", job=implnet_job_tdi, execution_timezone="US/Central") -def implnet_sch_tdi(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_tdl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_tdl.py deleted file mode 100644 index cfacd56d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_tdl.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_tdl import implnet_job_tdl - -@schedule(cron_schedule="0 21 * * 6", job=implnet_job_tdl, execution_timezone="US/Central") -def implnet_sch_tdl(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ucdl.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ucdl.py deleted file mode 100644 index e624cb9d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ucdl.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ucdl import implnet_job_ucdl - -@schedule(cron_schedule="0 18 * * 5", job=implnet_job_ucdl, execution_timezone="US/Central") -def implnet_sch_ucdl(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ucla.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ucla.py deleted file mode 100644 index f41b3af2..00000000 --- 
a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_ucla.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ucla import implnet_job_ucla - -@schedule(cron_schedule="0 0 * * 0", job=implnet_job_ucla, execution_timezone="US/Central") -def implnet_sch_ucla(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_unb.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_unb.py deleted file mode 100644 index ff7a9689..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_unb.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_unb import implnet_job_unb - -@schedule(cron_schedule="0 3 * * 0", job=implnet_job_unb, execution_timezone="US/Central") -def implnet_sch_unb(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_unc.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_unc.py deleted file mode 100644 index 68693458..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_unc.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_unc import implnet_job_unc - -@schedule(cron_schedule="0 6 * * 0", job=implnet_job_unc, execution_timezone="US/Central") -def implnet_sch_unc(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_uva.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_uva.py deleted file mode 100644 index 263453bd..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_uva.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_uva import implnet_job_uva - -@schedule(cron_schedule="0 3 * * 5", job=implnet_job_uva, execution_timezone="US/Central") 
-def implnet_sch_uva(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_uwi.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_uwi.py deleted file mode 100644 index 63bd54b4..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_uwi.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_uwi import implnet_job_uwi - -@schedule(cron_schedule="0 15 * * 0", job=implnet_job_uwi, execution_timezone="US/Central") -def implnet_sch_uwi(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_vtti.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_vtti.py deleted file mode 100644 index 4cb97dfa..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_vtti.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_vtti import implnet_job_vtti - -@schedule(cron_schedule="0 18 * * 0", job=implnet_job_vtti, execution_timezone="US/Central") -def implnet_sch_vtti(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_wardr.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_wardr.py deleted file mode 100644 index 0f7fe00f..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_wardr.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_wardr import implnet_job_wardr - -@schedule(cron_schedule="0 21 * * 0", job=implnet_job_wardr, execution_timezone="US/Central") -def implnet_sch_wardr(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_yalenus.py b/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_yalenus.py deleted file mode 100644 
index 4e29c86d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/sch/implnet_sch_yalenus.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_yalenus import implnet_job_yalenus - -@schedule(cron_schedule="0 0 * * 1", job=implnet_job_yalenus, execution_timezone="US/Central") -def implnet_sch_yalenus(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-nsdf/output/workspace.yaml b/dagster/implnets/generatedCode/implnet-nsdf/output/workspace.yaml deleted file mode 100644 index 54490e1d..00000000 --- a/dagster/implnets/generatedCode/implnet-nsdf/output/workspace.yaml +++ /dev/null @@ -1,4 +0,0 @@ -load_from: - - python_file: - relative_path: "repositories/repository.py" - working_directory: . \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_africaioc.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_africaioc.py deleted file mode 100644 index 56965af5..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_africaioc.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_africaioc import harvest_africaioc - -@job -def implnet_job_africaioc(): - harvest_africaioc() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_aquadocs.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_aquadocs.py deleted file mode 100644 index 8d384135..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_aquadocs.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_aquadocs import harvest_aquadocs - -@job -def implnet_job_aquadocs(): - harvest_aquadocs() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_benguelacc.py 
b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_benguelacc.py deleted file mode 100644 index 650fe167..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_benguelacc.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_benguelacc import harvest_benguelacc - -@job -def implnet_job_benguelacc(): - harvest_benguelacc() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_caribbeanmarineatlas.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_caribbeanmarineatlas.py deleted file mode 100644 index 2761d08d..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_caribbeanmarineatlas.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_caribbeanmarineatlas import harvest_caribbeanmarineatlas - -@job -def implnet_job_caribbeanmarineatlas(): - harvest_caribbeanmarineatlas() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_cioos.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_cioos.py deleted file mode 100644 index b16b90cd..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_cioos.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_cioos import harvest_cioos - -@job -def implnet_job_cioos(): - harvest_cioos() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_edmerp.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_edmerp.py deleted file mode 100644 index 3415a757..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_edmerp.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_edmerp import harvest_edmerp - -@job -def implnet_job_edmerp(): - harvest_edmerp() \ No newline at end of 
file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_edmo.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_edmo.py deleted file mode 100644 index bc379313..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_edmo.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_edmo import harvest_edmo - -@job -def implnet_job_edmo(): - harvest_edmo() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_emodnet.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_emodnet.py deleted file mode 100644 index 34329714..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_emodnet.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_emodnet import harvest_emodnet - -@job -def implnet_job_emodnet(): - harvest_emodnet() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanevents.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanevents.py deleted file mode 100644 index a407682d..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanevents.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_euroceanevents import harvest_euroceanevents - -@job -def implnet_job_euroceanevents(): - harvest_euroceanevents() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanexperts.py deleted file mode 100644 index 8e937e18..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanexperts.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_euroceanexperts import harvest_euroceanexperts - 
-@job -def implnet_job_euroceanexperts(): - harvest_euroceanexperts() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceaninstitutions.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceaninstitutions.py deleted file mode 100644 index 243f18df..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceaninstitutions.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_euroceaninstitutions import harvest_euroceaninstitutions - -@job -def implnet_job_euroceaninstitutions(): - harvest_euroceaninstitutions() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanorgs.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanorgs.py deleted file mode 100644 index e5b703f8..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanorgs.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_euroceanorgs import harvest_euroceanorgs - -@job -def implnet_job_euroceanorgs(): - harvest_euroceanorgs() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanprojects.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanprojects.py deleted file mode 100644 index d96782bd..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanprojects.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_euroceanprojects import harvest_euroceanprojects - -@job -def implnet_job_euroceanprojects(): - harvest_euroceanprojects() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceantraining.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceantraining.py deleted file 
mode 100644 index 3c0a1903..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceantraining.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_euroceantraining import harvest_euroceantraining - -@job -def implnet_job_euroceantraining(): - harvest_euroceantraining() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanvessels.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanvessels.py deleted file mode 100644 index e21b1990..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_euroceanvessels.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_euroceanvessels import harvest_euroceanvessels - -@job -def implnet_job_euroceanvessels(): - harvest_euroceanvessels() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_inanodc.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_inanodc.py deleted file mode 100644 index e432faed..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_inanodc.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_inanodc import harvest_inanodc - -@job -def implnet_job_inanodc(): - harvest_inanodc() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemardocuments.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemardocuments.py deleted file mode 100644 index 35197b05..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemardocuments.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_invemardocuments import harvest_invemardocuments - -@job -def implnet_job_invemardocuments(): - harvest_invemardocuments() \ No newline at end of file diff --git 
a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarexperts.py deleted file mode 100644 index 32e94d0c..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarexperts.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_invemarexperts import harvest_invemarexperts - -@job -def implnet_job_invemarexperts(): - harvest_invemarexperts() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarinstitutions.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarinstitutions.py deleted file mode 100644 index 92cf2724..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarinstitutions.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_invemarinstitutions import harvest_invemarinstitutions - -@job -def implnet_job_invemarinstitutions(): - harvest_invemarinstitutions() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemartraining.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemartraining.py deleted file mode 100644 index efb05401..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemartraining.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_invemartraining import harvest_invemartraining - -@job -def implnet_job_invemartraining(): - harvest_invemartraining() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarvessels.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarvessels.py deleted file mode 100644 index 356bdf6d..00000000 --- 
a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_invemarvessels.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_invemarvessels import harvest_invemarvessels - -@job -def implnet_job_invemarvessels(): - harvest_invemarvessels() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_marinetraining.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_marinetraining.py deleted file mode 100644 index ecf79a91..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_marinetraining.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_marinetraining import harvest_marinetraining - -@job -def implnet_job_marinetraining(): - harvest_marinetraining() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_maspawio.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_maspawio.py deleted file mode 100644 index 7352ae89..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_maspawio.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_maspawio import harvest_maspawio - -@job -def implnet_job_maspawio(): - harvest_maspawio() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_obis.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_obis.py deleted file mode 100644 index e04416ef..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_obis.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_obis import harvest_obis - -@job -def implnet_job_obis(): - harvest_obis() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_obps.py 
b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_obps.py deleted file mode 100644 index 6477fe4a..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_obps.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_obps import harvest_obps - -@job -def implnet_job_obps(): - harvest_obps() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_oceanexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_oceanexperts.py deleted file mode 100644 index ea8d50ef..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_oceanexperts.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_oceanexperts import harvest_oceanexperts - -@job -def implnet_job_oceanexperts(): - harvest_oceanexperts() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_oceanscape.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_oceanscape.py deleted file mode 100644 index 2b842fbc..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_oceanscape.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_oceanscape import harvest_oceanscape - -@job -def implnet_job_oceanscape(): - harvest_oceanscape() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_pdh.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_pdh.py deleted file mode 100644 index 3e123926..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_pdh.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_pdh import harvest_pdh - -@job -def implnet_job_pdh(): - harvest_pdh() \ No newline at end of file diff --git 
a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_pogo.py b/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_pogo.py deleted file mode 100644 index 5feb230e..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/jobs/implnet_jobs_pogo.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_pogo import harvest_pogo - -@job -def implnet_job_pogo(): - harvest_pogo() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_africaioc.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_africaioc.py deleted file mode 100644 index 92541474..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_africaioc.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - 
access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif 
(str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', 
- 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" + query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def africaioc_gleaner(context): - returned_value = gleanerio(("gleaner"), "africaioc") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def africaioc_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "africaioc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def africaioc_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "africaioc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def africaioc_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "africaioc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def africaioc_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "africaioc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_africaioc(): - harvest = africaioc_gleaner() - load1 = africaioc_nabu(harvest) - load2 = africaioc_nabuprov(load1) 
- load3 = africaioc_nabuorg(load2) - load4 = africaioc_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_aquadocs.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_aquadocs.py deleted file mode 100644 index 2617335b..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_aquadocs.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - 
server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def aquadocs_gleaner(context): - returned_value = gleanerio(("gleaner"), "aquadocs") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def aquadocs_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "aquadocs") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def aquadocs_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "aquadocs") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def aquadocs_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "aquadocs") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def aquadocs_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "aquadocs") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_aquadocs(): - harvest = aquadocs_gleaner() - load1 = aquadocs_nabu(harvest) - load2 = aquadocs_nabuprov(load1) - load3 = aquadocs_nabuorg(load2) - load4 = aquadocs_naburelease(load3) diff --git 
a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_benguelacc.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_benguelacc.py deleted file mode 100644 index 4167640d..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_benguelacc.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - 
secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def benguelacc_gleaner(context): - returned_value = gleanerio(("gleaner"), "benguelacc") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def benguelacc_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "benguelacc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def benguelacc_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "benguelacc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def benguelacc_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "benguelacc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def benguelacc_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "benguelacc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_benguelacc(): - harvest = benguelacc_gleaner() - load1 = benguelacc_nabu(harvest) - load2 = benguelacc_nabuprov(load1) - load3 = benguelacc_nabuorg(load2) - load4 = benguelacc_naburelease(load3) 
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_caribbeanmarineatlas.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_caribbeanmarineatlas.py deleted file mode 100644 index 3b8f0c41..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_caribbeanmarineatlas.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - 
access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def caribbeanmarineatlas_gleaner(context): - returned_value = gleanerio(("gleaner"), "caribbeanmarineatlas") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def caribbeanmarineatlas_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "caribbeanmarineatlas") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def caribbeanmarineatlas_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "caribbeanmarineatlas") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def caribbeanmarineatlas_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "caribbeanmarineatlas") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def caribbeanmarineatlas_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "caribbeanmarineatlas") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_caribbeanmarineatlas(): - harvest = caribbeanmarineatlas_gleaner() - load1 = 
caribbeanmarineatlas_nabu(harvest) - load2 = caribbeanmarineatlas_nabuprov(load1) - load3 = caribbeanmarineatlas_nabuorg(load2) - load4 = caribbeanmarineatlas_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_cioos.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_cioos.py deleted file mode 100644 index cb156905..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_cioos.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = 
os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 
'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def cioos_gleaner(context): - returned_value = gleanerio(("gleaner"), "cioos") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def cioos_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "cioos") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cioos_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "cioos") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cioos_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "cioos") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def cioos_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "cioos") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_cioos(): - harvest = cioos_gleaner() - load1 = cioos_nabu(harvest) - load2 = cioos_nabuprov(load1) - load3 = cioos_nabuorg(load2) - load4 = cioos_naburelease(load3) diff --git 
a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_edmerp.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_edmerp.py deleted file mode 100644 index 31ee259c..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_edmerp.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - 
secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def edmerp_gleaner(context): - returned_value = gleanerio(("gleaner"), "edmerp") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def edmerp_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "edmerp") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def edmerp_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "edmerp") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def edmerp_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "edmerp") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def edmerp_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "edmerp") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_edmerp(): - harvest = edmerp_gleaner() - load1 = edmerp_nabu(harvest) - load2 = edmerp_nabuprov(load1) - load3 = edmerp_nabuorg(load2) - load4 = edmerp_naburelease(load3) diff --git 
a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_edmo.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_edmo.py deleted file mode 100644 index 706e1b60..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_edmo.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - 
secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def edmo_gleaner(context): - returned_value = gleanerio(("gleaner"), "edmo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def edmo_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "edmo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def edmo_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "edmo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def edmo_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "edmo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def edmo_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "edmo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_edmo(): - harvest = edmo_gleaner() - load1 = edmo_nabu(harvest) - load2 = edmo_nabuprov(load1) - load3 = edmo_nabuorg(load2) - load4 = edmo_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_emodnet.py 
b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_emodnet.py deleted file mode 100644 index ffe2953a..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_emodnet.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = 
["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def emodnet_gleaner(context): - returned_value = gleanerio(("gleaner"), "emodnet") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def emodnet_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "emodnet") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def emodnet_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "emodnet") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def emodnet_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "emodnet") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def emodnet_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "emodnet") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_emodnet(): - harvest = emodnet_gleaner() - load1 = emodnet_nabu(harvest) - load2 = emodnet_nabuprov(load1) - load3 = emodnet_nabuorg(load2) - load4 = emodnet_naburelease(load3) diff --git 
a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanevents.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanevents.py deleted file mode 100644 index 40b74e75..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanevents.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - 
access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def euroceanevents_gleaner(context): - returned_value = gleanerio(("gleaner"), "euroceanevents") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def euroceanevents_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "euroceanevents") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanevents_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "euroceanevents") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanevents_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "euroceanevents") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanevents_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "euroceanevents") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_euroceanevents(): - harvest = euroceanevents_gleaner() - load1 = euroceanevents_nabu(harvest) - load2 = euroceanevents_nabuprov(load1) - load3 = 
euroceanevents_nabuorg(load2) - load4 = euroceanevents_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanexperts.py deleted file mode 100644 index 0f30c008..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanexperts.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - 
client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE 
= os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def euroceanexperts_gleaner(context): - returned_value = gleanerio(("gleaner"), "euroceanexperts") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def euroceanexperts_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "euroceanexperts") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanexperts_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "euroceanexperts") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanexperts_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "euroceanexperts") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanexperts_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "euroceanexperts") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_euroceanexperts(): - harvest = euroceanexperts_gleaner() - load1 = euroceanexperts_nabu(harvest) - load2 = euroceanexperts_nabuprov(load1) - load3 
= euroceanexperts_nabuorg(load2) - load4 = euroceanexperts_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceaninstitutions.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceaninstitutions.py deleted file mode 100644 index abb012d7..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceaninstitutions.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + 
os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file 
writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def euroceaninstitutions_gleaner(context): - returned_value = gleanerio(("gleaner"), "euroceaninstitutions") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def euroceaninstitutions_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "euroceaninstitutions") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceaninstitutions_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "euroceaninstitutions") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceaninstitutions_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "euroceaninstitutions") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceaninstitutions_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "euroceaninstitutions") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_euroceaninstitutions(): - harvest = euroceaninstitutions_gleaner() - load1 = 
euroceaninstitutions_nabu(harvest) - load2 = euroceaninstitutions_nabuprov(load1) - load3 = euroceaninstitutions_nabuorg(load2) - load4 = euroceaninstitutions_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanorgs.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanorgs.py deleted file mode 100644 index 1e221581..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanorgs.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - 
server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 
'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def euroceanorgs_gleaner(context): - returned_value = gleanerio(("gleaner"), "euroceanorgs") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def euroceanorgs_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "euroceanorgs") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanorgs_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "euroceanorgs") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanorgs_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "euroceanorgs") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanorgs_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "euroceanorgs") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_euroceanorgs(): - harvest = euroceanorgs_gleaner() - load1 = euroceanorgs_nabu(harvest) - load2 = euroceanorgs_nabuprov(load1) - load3 = euroceanorgs_nabuorg(load2) - load4 = 
euroceanorgs_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanprojects.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanprojects.py deleted file mode 100644 index d76a388e..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanprojects.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, 
- access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') 
- ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def euroceanprojects_gleaner(context): - returned_value = gleanerio(("gleaner"), "euroceanprojects") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def euroceanprojects_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "euroceanprojects") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanprojects_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "euroceanprojects") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanprojects_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "euroceanprojects") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanprojects_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "euroceanprojects") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_euroceanprojects(): - harvest = euroceanprojects_gleaner() - load1 = euroceanprojects_nabu(harvest) - load2 = 
euroceanprojects_nabuprov(load1) - load3 = euroceanprojects_nabuorg(load2) - load4 = euroceanprojects_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceantraining.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceantraining.py deleted file mode 100644 index 89e918ec..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceantraining.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') 
+ ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log 
file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def euroceantraining_gleaner(context): - returned_value = gleanerio(("gleaner"), "euroceantraining") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def euroceantraining_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "euroceantraining") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceantraining_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "euroceantraining") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceantraining_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "euroceantraining") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceantraining_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "euroceantraining") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_euroceantraining(): - harvest = euroceantraining_gleaner() - load1 = euroceantraining_nabu(harvest) - load2 = 
euroceantraining_nabuprov(load1) - load3 = euroceantraining_nabuorg(load2) - load4 = euroceantraining_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanvessels.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanvessels.py deleted file mode 100644 index 6691f026..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_euroceanvessels.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + 
":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file 
writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def euroceanvessels_gleaner(context): - returned_value = gleanerio(("gleaner"), "euroceanvessels") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def euroceanvessels_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "euroceanvessels") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanvessels_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "euroceanvessels") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanvessels_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "euroceanvessels") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def euroceanvessels_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "euroceanvessels") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_euroceanvessels(): - harvest = euroceanvessels_gleaner() - load1 = euroceanvessels_nabu(harvest) - load2 = euroceanvessels_nabuprov(load1) - load3 
= euroceanvessels_nabuorg(load2) - load4 = euroceanvessels_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_inanodc.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_inanodc.py deleted file mode 100644 index feb61b6c..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_inanodc.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - 
server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def inanodc_gleaner(context): - returned_value = gleanerio(("gleaner"), "inanodc") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def inanodc_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "inanodc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def inanodc_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "inanodc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def inanodc_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "inanodc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def inanodc_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "inanodc") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_inanodc(): - harvest = inanodc_gleaner() - load1 = inanodc_nabu(harvest) - load2 = inanodc_nabuprov(load1) - load3 = inanodc_nabuorg(load2) - load4 = inanodc_naburelease(load3) diff --git 
a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemardocuments.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemardocuments.py deleted file mode 100644 index 0a3131bb..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemardocuments.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - 
access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def invemardocuments_gleaner(context): - returned_value = gleanerio(("gleaner"), "invemardocuments") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def invemardocuments_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "invemardocuments") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemardocuments_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "invemardocuments") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemardocuments_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "invemardocuments") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemardocuments_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "invemardocuments") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_invemardocuments(): - harvest = invemardocuments_gleaner() - load1 = invemardocuments_nabu(harvest) - load2 = 
invemardocuments_nabuprov(load1) - load3 = invemardocuments_nabuorg(load2) - load4 = invemardocuments_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarexperts.py deleted file mode 100644 index 883bdef1..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarexperts.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" 
+ os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file 
writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def invemarexperts_gleaner(context): - returned_value = gleanerio(("gleaner"), "invemarexperts") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def invemarexperts_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "invemarexperts") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemarexperts_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "invemarexperts") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemarexperts_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "invemarexperts") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemarexperts_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "invemarexperts") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_invemarexperts(): - harvest = invemarexperts_gleaner() - load1 = invemarexperts_nabu(harvest) - load2 = invemarexperts_nabuprov(load1) - load3 = 
invemarexperts_nabuorg(load2) - load4 = invemarexperts_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarinstitutions.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarinstitutions.py deleted file mode 100644 index c3dd3730..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarinstitutions.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + 
os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file 
writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def invemarinstitutions_gleaner(context): - returned_value = gleanerio(("gleaner"), "invemarinstitutions") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def invemarinstitutions_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "invemarinstitutions") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemarinstitutions_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "invemarinstitutions") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemarinstitutions_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "invemarinstitutions") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemarinstitutions_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "invemarinstitutions") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_invemarinstitutions(): - harvest = invemarinstitutions_gleaner() - load1 = 
invemarinstitutions_nabu(harvest) - load2 = invemarinstitutions_nabuprov(load1) - load3 = invemarinstitutions_nabuorg(load2) - load4 = invemarinstitutions_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemartraining.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemartraining.py deleted file mode 100644 index 2e4b6e19..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemartraining.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, 
name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # 
LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def invemartraining_gleaner(context): - returned_value = gleanerio(("gleaner"), "invemartraining") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def invemartraining_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "invemartraining") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemartraining_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "invemartraining") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemartraining_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "invemartraining") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemartraining_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "invemartraining") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_invemartraining(): - harvest = invemartraining_gleaner() - load1 = invemartraining_nabu(harvest) - load2 = invemartraining_nabuprov(load1) - load3 
= invemartraining_nabuorg(load2) - load4 = invemartraining_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarvessels.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarvessels.py deleted file mode 100644 index 50bad4c5..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_invemarvessels.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - 
client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE 
= os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def invemarvessels_gleaner(context): - returned_value = gleanerio(("gleaner"), "invemarvessels") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def invemarvessels_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "invemarvessels") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemarvessels_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "invemarvessels") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemarvessels_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "invemarvessels") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def invemarvessels_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "invemarvessels") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_invemarvessels(): - harvest = invemarvessels_gleaner() - load1 = invemarvessels_nabu(harvest) - load2 = invemarvessels_nabuprov(load1) - load3 = 
invemarvessels_nabuorg(load2) - load4 = invemarvessels_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_marinetraining.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_marinetraining.py deleted file mode 100644 index 6ff02664..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_marinetraining.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - 
client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE 
= os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def marinetraining_gleaner(context): - returned_value = gleanerio(("gleaner"), "marinetraining") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def marinetraining_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "marinetraining") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def marinetraining_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "marinetraining") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def marinetraining_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "marinetraining") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def marinetraining_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "marinetraining") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_marinetraining(): - harvest = marinetraining_gleaner() - load1 = marinetraining_nabu(harvest) - load2 = marinetraining_nabuprov(load1) - load3 = 
marinetraining_nabuorg(load2) - load4 = marinetraining_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_maspawio.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_maspawio.py deleted file mode 100644 index 03ecea27..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_maspawio.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - 
server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = 
os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def maspawio_gleaner(context): - returned_value = gleanerio(("gleaner"), "maspawio") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def maspawio_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "maspawio") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def maspawio_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "maspawio") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def maspawio_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "maspawio") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def maspawio_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "maspawio") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_maspawio(): - harvest = maspawio_gleaner() - load1 = maspawio_nabu(harvest) - load2 = maspawio_nabuprov(load1) - load3 = maspawio_nabuorg(load2) - load4 = maspawio_naburelease(load3) diff --git 
a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_obis.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_obis.py deleted file mode 100644 index d8441180..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_obis.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - 
secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def obis_gleaner(context): - returned_value = gleanerio(("gleaner"), "obis") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def obis_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "obis") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def obis_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "obis") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def obis_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "obis") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def obis_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "obis") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_obis(): - harvest = obis_gleaner() - load1 = obis_nabu(harvest) - load2 = obis_nabuprov(load1) - load3 = obis_nabuorg(load2) - load4 = obis_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_obps.py 
b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_obps.py deleted file mode 100644 index 8de17c32..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_obps.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = 
["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def obps_gleaner(context): - returned_value = gleanerio(("gleaner"), "obps") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def obps_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "obps") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def obps_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "obps") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def obps_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "obps") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def obps_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "obps") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_obps(): - harvest = obps_gleaner() - load1 = obps_nabu(harvest) - load2 = obps_nabuprov(load1) - load3 = obps_nabuorg(load2) - load4 = obps_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_oceanexperts.py 
b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_oceanexperts.py deleted file mode 100644 index 576c1823..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_oceanexperts.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not 
exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = 
["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def oceanexperts_gleaner(context): - returned_value = gleanerio(("gleaner"), "oceanexperts") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def oceanexperts_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "oceanexperts") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def oceanexperts_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "oceanexperts") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def oceanexperts_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "oceanexperts") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def oceanexperts_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "oceanexperts") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_oceanexperts(): - harvest = oceanexperts_gleaner() - load1 = oceanexperts_nabu(harvest) - load2 = oceanexperts_nabuprov(load1) - load3 = oceanexperts_nabuorg(load2) - load4 = 
oceanexperts_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_oceanscape.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_oceanscape.py deleted file mode 100644 index ded284fe..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_oceanscape.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - 
access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - 
ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def oceanscape_gleaner(context): - returned_value = gleanerio(("gleaner"), "oceanscape") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def oceanscape_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "oceanscape") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def oceanscape_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "oceanscape") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def oceanscape_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "oceanscape") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def oceanscape_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "oceanscape") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_oceanscape(): - harvest = oceanscape_gleaner() - load1 = oceanscape_nabu(harvest) - load2 = oceanscape_nabuprov(load1) - load3 = oceanscape_nabuorg(load2) - load4 = oceanscape_naburelease(load3) 
diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_pdh.py b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_pdh.py deleted file mode 100644 index 4e754958..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_pdh.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - 
secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = 
os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def pdh_gleaner(context): - returned_value = gleanerio(("gleaner"), "pdh") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def pdh_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "pdh") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def pdh_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "pdh") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def pdh_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "pdh") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def pdh_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "pdh") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_pdh(): - harvest = pdh_gleaner() - load1 = pdh_nabu(harvest) - load2 = pdh_nabuprov(load1) - load3 = pdh_nabuorg(load2) - load4 = pdh_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_pogo.py 
b/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_pogo.py deleted file mode 100644 index 4511e7f8..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/ops/implnet_ops_pogo.py +++ /dev/null @@ -1,293 +0,0 @@ -from dagster import op, graph, get_dagster_logger -import subprocess -import os, json, io -import urllib -from urllib import request -from dagster import job, op, get_dagster_logger -from minio import Minio -from minio.error import S3Error -from datetime import datetime - -# Vars and Envs - -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - -MINIO_URL = os.environ.get('GLEANER_MINIO_URL') -MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -MINIO_SSL = os.environ.get('GLEANER_MINIO_SSL') -MINIO_SECRET = os.environ.get('GLEANER_MINIO_SECRET') -MINIO_KEY = os.environ.get('GLEANER_MINIO_KEY') -MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') - - -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - try: - data = client.get_object(os.environ.get('GLEANER_MINIO_BUCKET'), object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - server = os.environ.get('GLEANER_MINIO_URL') + ":" + os.environ.get('GLEANER_MINIO_PORT') - client = Minio( - server, - secure=False, - access_key=os.environ.get('GLEANER_MINIO_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET'), - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - client.put_object(os.environ.get('GLEANER_MINIO_BUCKET'), - objPrefix, - io.BytesIO(data), - len(data)) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") - - -def gleanerio(mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Create: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - CMD = ["--cfg", "/gleaner/gleanerconfig.yaml", "--source", source, "--rude"] - NAME = "gleaner01_" + source - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prune", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "prov/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = 
["--cfg", "/nabu/nabuconfig.yaml", "prefix", "--prefix", "orgs"] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", "/nabu/nabuconfig.yaml", "release", "--prefix", "summoned/" + source] - NAME = "nabu01_" + source - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - data = {} - data["Image"] = IMAGE - data["Cmd"] = CMD - - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - enva = [] - enva.append(str("MINIO_URL={}".format(MINIO_URL))) - enva.append(str("MINIO_PORT={}".format(MINIO_PORT))) - enva.append(str("MINIO_SSL={}".format(MINIO_SSL))) - enva.append(str("MINIO_SECRET={}".format(MINIO_SECRET))) - enva.append(str("MINIO_KEY={}".format(MINIO_KEY))) - enva.append(str("MINIO_BUCKET={}".format(MINIO_BUCKET))) - - data["Env"] = enva - - url = URL + 'containers/create' - params = { - "name": NAME - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - get_dagster_logger().info(f"URL: {str(url)}") - - req = request.Request(url, str.encode(json.dumps(data))) - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - c = r.read() - d = json.loads(c) - cid = d['Id'] - - print(r.status) - get_dagster_logger().info(f"Create: {str(r.status)}") - - # print(cid) - - ## ------------ Archive to load, which is how to send in the config (from where?) - - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': ARCHIVE_PATH - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - - # DATA = read_file_bytestream(ARCHIVE_FILE) - DATA = s3reader(ARCHIVE_FILE) - - req = request.Request(url, data=DATA, method="PUT") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - print(r.status) - get_dagster_logger().info(f"Archive: {str(r.status)}") - - # c = r.read() - # print(c) - # d = json.loads(c) - # print(d) - - ## ------------ Start - - url = URL + 'containers/' + cid + '/start' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Start: {str(r.status)}") - - ## ------------ Wait expect 200 - - url = URL + 'containers/' + cid + '/wait' - req = request.Request(url, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Wait: {str(r.status)}") - - ## ------------ Copy logs expect 200 - - url = URL + 'containers/' + cid + '/logs' - params = { - 'stdout': 'true', - 'stderr': 'false' - } - query_string = urllib.parse.urlencode(params) - - url = url + "?" 
+ query_string - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - c = r.read() - - # write to file - # f = open(LOGFILE, 'w') - # f.write(str(c)) - # f.close() - - # write to s3 - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"Logs: {str(r.status)}") - - ## ------------ Remove expect 204 - - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Remove: {str(r.status)}") - - return 0 - -@op -def pogo_gleaner(context): - returned_value = gleanerio(("gleaner"), "pogo") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def pogo_nabu(context, msg: str): - returned_value = gleanerio(("nabu"), "pogo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def pogo_nabuprov(context, msg: str): - returned_value = gleanerio(("prov"), "pogo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def pogo_nabuorg(context, msg: str): - returned_value = gleanerio(("orgs"), "pogo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def pogo_naburelease(context, msg: str): - returned_value = gleanerio(("release"), "pogo") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@graph -def harvest_pogo(): - harvest = pogo_gleaner() - load1 = pogo_nabu(harvest) - load2 = pogo_nabuprov(load1) - load3 = pogo_nabuorg(load2) - load4 = pogo_naburelease(load3) diff --git a/dagster/implnets/generatedCode/implnet-oih/output/repositories/repository.py 
b/dagster/implnets/generatedCode/implnet-oih/output/repositories/repository.py deleted file mode 100644 index f7f74595..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/repositories/repository.py +++ /dev/null @@ -1,371 +0,0 @@ -from dagster import repository -from jobs.implnet_jobs_nwisgw20 import implnet_job_nwisgw20 -from sch.implnet_sch_nwisgw20 import implnet_sch_nwisgw20 -from jobs.implnet_jobs_nwisgw22 import implnet_job_nwisgw22 -from sch.implnet_sch_nwisgw22 import implnet_sch_nwisgw22 -from jobs.implnet_jobs_nwisgw16 import implnet_job_nwisgw16 -from sch.implnet_sch_nwisgw16 import implnet_sch_nwisgw16 -from jobs.implnet_jobs_nwisgw12 import implnet_job_nwisgw12 -from sch.implnet_sch_nwisgw12 import implnet_sch_nwisgw12 -from jobs.implnet_jobs_nwisgw25 import implnet_job_nwisgw25 -from sch.implnet_sch_nwisgw25 import implnet_sch_nwisgw25 -from jobs.implnet_jobs_nwisgw14 import implnet_job_nwisgw14 -from sch.implnet_sch_nwisgw14 import implnet_sch_nwisgw14 -from jobs.implnet_jobs_nwisgw23 import implnet_job_nwisgw23 -from sch.implnet_sch_nwisgw23 import implnet_sch_nwisgw23 -from jobs.implnet_jobs_nwisgw10 import implnet_job_nwisgw10 -from sch.implnet_sch_nwisgw10 import implnet_sch_nwisgw10 -from jobs.implnet_jobs_nwisgw15 import implnet_job_nwisgw15 -from sch.implnet_sch_nwisgw15 import implnet_sch_nwisgw15 -from jobs.implnet_jobs_nwisgw2 import implnet_job_nwisgw2 -from sch.implnet_sch_nwisgw2 import implnet_sch_nwisgw2 -from jobs.implnet_jobs_nwisgw24 import implnet_job_nwisgw24 -from sch.implnet_sch_nwisgw24 import implnet_sch_nwisgw24 -from jobs.implnet_jobs_nwisgw9 import implnet_job_nwisgw9 -from sch.implnet_sch_nwisgw9 import implnet_sch_nwisgw9 -from jobs.implnet_jobs_nwisgw19 import implnet_job_nwisgw19 -from sch.implnet_sch_nwisgw19 import implnet_sch_nwisgw19 -from jobs.implnet_jobs_nwisgw28 import implnet_job_nwisgw28 -from sch.implnet_sch_nwisgw28 import implnet_sch_nwisgw28 -from jobs.implnet_jobs_nwisgw26 import 
implnet_job_nwisgw26 -from sch.implnet_sch_nwisgw26 import implnet_sch_nwisgw26 -from jobs.implnet_jobs_nwisgw5 import implnet_job_nwisgw5 -from sch.implnet_sch_nwisgw5 import implnet_sch_nwisgw5 -from jobs.implnet_jobs_nwisgw13 import implnet_job_nwisgw13 -from sch.implnet_sch_nwisgw13 import implnet_sch_nwisgw13 -from jobs.implnet_jobs_nwisgw6 import implnet_job_nwisgw6 -from sch.implnet_sch_nwisgw6 import implnet_sch_nwisgw6 -from jobs.implnet_jobs_nwisgw3 import implnet_job_nwisgw3 -from sch.implnet_sch_nwisgw3 import implnet_sch_nwisgw3 -from jobs.implnet_jobs_nwisgw4 import implnet_job_nwisgw4 -from sch.implnet_sch_nwisgw4 import implnet_sch_nwisgw4 -from jobs.implnet_jobs_nwisgw1 import implnet_job_nwisgw1 -from sch.implnet_sch_nwisgw1 import implnet_sch_nwisgw1 -from jobs.implnet_jobs_nwisgw21 import implnet_job_nwisgw21 -from sch.implnet_sch_nwisgw21 import implnet_sch_nwisgw21 -from jobs.implnet_jobs_nwisgw27 import implnet_job_nwisgw27 -from sch.implnet_sch_nwisgw27 import implnet_sch_nwisgw27 -from jobs.implnet_jobs_nwisgw8 import implnet_job_nwisgw8 -from sch.implnet_sch_nwisgw8 import implnet_sch_nwisgw8 -from jobs.implnet_jobs_nwisgw17 import implnet_job_nwisgw17 -from sch.implnet_sch_nwisgw17 import implnet_sch_nwisgw17 -from jobs.implnet_jobs_nwisgw18 import implnet_job_nwisgw18 -from sch.implnet_sch_nwisgw18 import implnet_sch_nwisgw18 -from jobs.implnet_jobs_nwisgw7 import implnet_job_nwisgw7 -from sch.implnet_sch_nwisgw7 import implnet_sch_nwisgw7 -from jobs.implnet_jobs_nwisgw11 import implnet_job_nwisgw11 -from sch.implnet_sch_nwisgw11 import implnet_sch_nwisgw11 -from jobs.implnet_jobs_nwisgw0 import implnet_job_nwisgw0 -from sch.implnet_sch_nwisgw0 import implnet_sch_nwisgw0 -from jobs.implnet_jobs_nwissite1 import implnet_job_nwissite1 -from sch.implnet_sch_nwissite1 import implnet_sch_nwissite1 -from jobs.implnet_jobs_nwissite3 import implnet_job_nwissite3 -from sch.implnet_sch_nwissite3 import implnet_sch_nwissite3 -from 
jobs.implnet_jobs_nwissite0 import implnet_job_nwissite0 -from sch.implnet_sch_nwissite0 import implnet_sch_nwissite0 -from jobs.implnet_jobs_nwissite2 import implnet_job_nwissite2 -from sch.implnet_sch_nwissite2 import implnet_sch_nwissite2 -from jobs.implnet_jobs_gfv11pois1 import implnet_job_gfv11pois1 -from sch.implnet_sch_gfv11pois1 import implnet_sch_gfv11pois1 -from jobs.implnet_jobs_gfv11pois0 import implnet_job_gfv11pois0 -from sch.implnet_sch_gfv11pois0 import implnet_sch_gfv11pois0 -from jobs.implnet_jobs_hydrologicunit0 import implnet_job_hydrologicunit0 -from sch.implnet_sch_hydrologicunit0 import implnet_sch_hydrologicunit0 -from jobs.implnet_jobs_damspids0 import implnet_job_damspids0 -from sch.implnet_sch_damspids0 import implnet_sch_damspids0 -from jobs.implnet_jobs_cuahsihishydrodataczhrids0 import implnet_job_cuahsihishydrodataczhrids0 -from sch.implnet_sch_cuahsihishydrodataczhrids0 import implnet_sch_cuahsihishydrodataczhrids0 -from jobs.implnet_jobs_cuahsihisnooksackmicroclimatenetworkids0 import implnet_job_cuahsihisnooksackmicroclimatenetworkids0 -from sch.implnet_sch_cuahsihisnooksackmicroclimatenetworkids0 import implnet_sch_cuahsihisnooksackmicroclimatenetworkids0 -from jobs.implnet_jobs_cuahsihisneonids0 import implnet_job_cuahsihisneonids0 -from sch.implnet_sch_cuahsihisneonids0 import implnet_sch_cuahsihisneonids0 -from jobs.implnet_jobs_cuahsihisglobalriversobservatoryids0 import implnet_job_cuahsihisglobalriversobservatoryids0 -from sch.implnet_sch_cuahsihisglobalriversobservatoryids0 import implnet_sch_cuahsihisglobalriversobservatoryids0 -from jobs.implnet_jobs_cuahsihistncwaterdataids0 import implnet_job_cuahsihistncwaterdataids0 -from sch.implnet_sch_cuahsihistncwaterdataids0 import implnet_sch_cuahsihistncwaterdataids0 -from jobs.implnet_jobs_cuahsihisscotlandnwisids0 import implnet_job_cuahsihisscotlandnwisids0 -from sch.implnet_sch_cuahsihisscotlandnwisids0 import implnet_sch_cuahsihisscotlandnwisids0 -from 
jobs.implnet_jobs_cuahsihisczoboulderids0 import implnet_job_cuahsihisczoboulderids0 -from sch.implnet_sch_cuahsihisczoboulderids0 import implnet_sch_cuahsihisczoboulderids0 -from jobs.implnet_jobs_cuahsihisyosemitehydroclimatenetworkids0 import implnet_job_cuahsihisyosemitehydroclimatenetworkids0 -from sch.implnet_sch_cuahsihisyosemitehydroclimatenetworkids0 import implnet_sch_cuahsihisyosemitehydroclimatenetworkids0 -from jobs.implnet_jobs_cuahsihismuddyriverids0 import implnet_job_cuahsihismuddyriverids0 -from sch.implnet_sch_cuahsihismuddyriverids0 import implnet_sch_cuahsihismuddyriverids0 -from jobs.implnet_jobs_cuahsihisczomercedids0 import implnet_job_cuahsihisczomercedids0 -from sch.implnet_sch_cuahsihisczomercedids0 import implnet_sch_cuahsihisczomercedids0 -from jobs.implnet_jobs_cuahsihisghcnids0 import implnet_job_cuahsihisghcnids0 -from sch.implnet_sch_cuahsihisghcnids0 import implnet_sch_cuahsihisghcnids0 -from jobs.implnet_jobs_cuahsihismmaatacamaids0 import implnet_job_cuahsihismmaatacamaids0 -from sch.implnet_sch_cuahsihismmaatacamaids0 import implnet_sch_cuahsihismmaatacamaids0 -from jobs.implnet_jobs_cuahsihisumbcwqids0 import implnet_job_cuahsihisumbcwqids0 -from sch.implnet_sch_cuahsihisumbcwqids0 import implnet_sch_cuahsihisumbcwqids0 -from jobs.implnet_jobs_cuahsihisgleonlakeannieids0 import implnet_job_cuahsihisgleonlakeannieids0 -from sch.implnet_sch_cuahsihisgleonlakeannieids0 import implnet_sch_cuahsihisgleonlakeannieids0 -from jobs.implnet_jobs_cuahsihisluwlids0 import implnet_job_cuahsihisluwlids0 -from sch.implnet_sch_cuahsihisluwlids0 import implnet_sch_cuahsihisluwlids0 -from jobs.implnet_jobs_cuahsihiscedarriverids0 import implnet_job_cuahsihiscedarriverids0 -from sch.implnet_sch_cuahsihiscedarriverids0 import implnet_sch_cuahsihiscedarriverids0 -from jobs.implnet_jobs_cuahsihisccbepdapids0 import implnet_job_cuahsihisccbepdapids0 -from sch.implnet_sch_cuahsihisccbepdapids0 import implnet_sch_cuahsihisccbepdapids0 -from 
jobs.implnet_jobs_cuahsihiskansasweatherdataids0 import implnet_job_cuahsihiskansasweatherdataids0 -from sch.implnet_sch_cuahsihiskansasweatherdataids0 import implnet_sch_cuahsihiskansasweatherdataids0 -from jobs.implnet_jobs_cuahsihisodmkentstateids0 import implnet_job_cuahsihisodmkentstateids0 -from sch.implnet_sch_cuahsihisodmkentstateids0 import implnet_sch_cuahsihisodmkentstateids0 -from jobs.implnet_jobs_cuahsihisgleondorsetids0 import implnet_job_cuahsihisgleondorsetids0 -from sch.implnet_sch_cuahsihisgleondorsetids0 import implnet_sch_cuahsihisgleondorsetids0 -from jobs.implnet_jobs_cuahsihisclarksburgspids0 import implnet_job_cuahsihisclarksburgspids0 -from sch.implnet_sch_cuahsihisclarksburgspids0 import implnet_sch_cuahsihisclarksburgspids0 -from jobs.implnet_jobs_cuahsihiscrwaids0 import implnet_job_cuahsihiscrwaids0 -from sch.implnet_sch_cuahsihiscrwaids0 import implnet_sch_cuahsihiscrwaids0 -from jobs.implnet_jobs_cuahsihiscuisoids0 import implnet_job_cuahsihiscuisoids0 -from sch.implnet_sch_cuahsihiscuisoids0 import implnet_sch_cuahsihiscuisoids0 -from jobs.implnet_jobs_cuahsihisprovorivergamutids0 import implnet_job_cuahsihisprovorivergamutids0 -from sch.implnet_sch_cuahsihisprovorivergamutids0 import implnet_sch_cuahsihisprovorivergamutids0 -from jobs.implnet_jobs_cuahsihisirwaids0 import implnet_job_cuahsihisirwaids0 -from sch.implnet_sch_cuahsihisirwaids0 import implnet_sch_cuahsihisirwaids0 -from jobs.implnet_jobs_cuahsihisczoluquilloids0 import implnet_job_cuahsihisczoluquilloids0 -from sch.implnet_sch_cuahsihisczoluquilloids0 import implnet_sch_cuahsihisczoluquilloids0 -from jobs.implnet_jobs_cuahsihistuolumnemdwids0 import implnet_job_cuahsihistuolumnemdwids0 -from sch.implnet_sch_cuahsihistuolumnemdwids0 import implnet_sch_cuahsihistuolumnemdwids0 -from jobs.implnet_jobs_cuahsihisrmblids0 import implnet_job_cuahsihisrmblids0 -from sch.implnet_sch_cuahsihisrmblids0 import implnet_sch_cuahsihisrmblids0 -from 
jobs.implnet_jobs_cuahsihispanolaodmids0 import implnet_job_cuahsihispanolaodmids0 -from sch.implnet_sch_cuahsihispanolaodmids0 import implnet_sch_cuahsihispanolaodmids0 -from jobs.implnet_jobs_cuahsihisnewnids0 import implnet_job_cuahsihisnewnids0 -from sch.implnet_sch_cuahsihisnewnids0 import implnet_sch_cuahsihisnewnids0 -from jobs.implnet_jobs_cuahsihisczoudelids0 import implnet_job_cuahsihisczoudelids0 -from sch.implnet_sch_cuahsihisczoudelids0 import implnet_sch_cuahsihisczoudelids0 -from jobs.implnet_jobs_cuahsihisfarmrwaids0 import implnet_job_cuahsihisfarmrwaids0 -from sch.implnet_sch_cuahsihisfarmrwaids0 import implnet_sch_cuahsihisfarmrwaids0 -from jobs.implnet_jobs_cuahsihisskcmilltownids0 import implnet_job_cuahsihisskcmilltownids0 -from sch.implnet_sch_cuahsihisskcmilltownids0 import implnet_sch_cuahsihisskcmilltownids0 -from jobs.implnet_jobs_cuahsihisumbcgwids0 import implnet_job_cuahsihisumbcgwids0 -from sch.implnet_sch_cuahsihisumbcgwids0 import implnet_sch_cuahsihisumbcgwids0 -from jobs.implnet_jobs_cuahsihisshalenetworkodmids0 import implnet_job_cuahsihisshalenetworkodmids0 -from sch.implnet_sch_cuahsihisshalenetworkodmids0 import implnet_sch_cuahsihisshalenetworkodmids0 -from jobs.implnet_jobs_cuahsihisnevadosids0 import implnet_job_cuahsihisnevadosids0 -from sch.implnet_sch_cuahsihisnevadosids0 import implnet_sch_cuahsihisnevadosids0 -from jobs.implnet_jobs_cuahsihisweiherbachids0 import implnet_job_cuahsihisweiherbachids0 -from sch.implnet_sch_cuahsihisweiherbachids0 import implnet_sch_cuahsihisweiherbachids0 -from jobs.implnet_jobs_cuahsihismazarriverprojectids0 import implnet_job_cuahsihismazarriverprojectids0 -from sch.implnet_sch_cuahsihismazarriverprojectids0 import implnet_sch_cuahsihismazarriverprojectids0 -from jobs.implnet_jobs_cuahsihisgleonsunapeeids0 import implnet_job_cuahsihisgleonsunapeeids0 -from sch.implnet_sch_cuahsihisgleonsunapeeids0 import implnet_sch_cuahsihisgleonsunapeeids0 -from 
jobs.implnet_jobs_cuahsihisorsancohabids0 import implnet_job_cuahsihisorsancohabids0 -from sch.implnet_sch_cuahsihisorsancohabids0 import implnet_sch_cuahsihisorsancohabids0 -from jobs.implnet_jobs_cuahsihismwraids0 import implnet_job_cuahsihismwraids0 -from sch.implnet_sch_cuahsihismwraids0 import implnet_sch_cuahsihismwraids0 -from jobs.implnet_jobs_cuahsihismaaeriids0 import implnet_job_cuahsihismaaeriids0 -from sch.implnet_sch_cuahsihismaaeriids0 import implnet_sch_cuahsihismaaeriids0 -from jobs.implnet_jobs_cuahsihisnceiww2ids0 import implnet_job_cuahsihisnceiww2ids0 -from sch.implnet_sch_cuahsihisnceiww2ids0 import implnet_sch_cuahsihisnceiww2ids0 -from jobs.implnet_jobs_cuahsihistarlandwaterqualityids0 import implnet_job_cuahsihistarlandwaterqualityids0 -from sch.implnet_sch_cuahsihistarlandwaterqualityids0 import implnet_sch_cuahsihistarlandwaterqualityids0 -from jobs.implnet_jobs_cuahsihislczoodm2ids0 import implnet_job_cuahsihislczoodm2ids0 -from sch.implnet_sch_cuahsihislczoodm2ids0 import implnet_sch_cuahsihislczoodm2ids0 -from jobs.implnet_jobs_cuahsihiscocorahsids0 import implnet_job_cuahsihiscocorahsids0 -from sch.implnet_sch_cuahsihiscocorahsids0 import implnet_sch_cuahsihiscocorahsids0 -from jobs.implnet_jobs_cuahsihisparalanaturalezaids0 import implnet_job_cuahsihisparalanaturalezaids0 -from sch.implnet_sch_cuahsihisparalanaturalezaids0 import implnet_sch_cuahsihisparalanaturalezaids0 -from jobs.implnet_jobs_cuahsihisczocatalinaids0 import implnet_job_cuahsihisczocatalinaids0 -from sch.implnet_sch_cuahsihisczocatalinaids0 import implnet_sch_cuahsihisczocatalinaids0 -from jobs.implnet_jobs_cuahsihisieeratwilkesuniversityids0 import implnet_job_cuahsihisieeratwilkesuniversityids0 -from sch.implnet_sch_cuahsihisieeratwilkesuniversityids0 import implnet_sch_cuahsihisieeratwilkesuniversityids0 -from jobs.implnet_jobs_cuahsihismudlakeids0 import implnet_job_cuahsihismudlakeids0 -from sch.implnet_sch_cuahsihismudlakeids0 import 
implnet_sch_cuahsihismudlakeids0 -from jobs.implnet_jobs_cuahsihismwdisids0 import implnet_job_cuahsihismwdisids0 -from sch.implnet_sch_cuahsihismwdisids0 import implnet_sch_cuahsihismwdisids0 -from jobs.implnet_jobs_cuahsihisloganriverids0 import implnet_job_cuahsihisloganriverids0 -from sch.implnet_sch_cuahsihisloganriverids0 import implnet_sch_cuahsihisloganriverids0 -from jobs.implnet_jobs_cuahsihisscanids0 import implnet_job_cuahsihisscanids0 -from sch.implnet_sch_cuahsihisscanids0 import implnet_sch_cuahsihisscanids0 -from jobs.implnet_jobs_cuahsihisnashrwaids0 import implnet_job_cuahsihisnashrwaids0 -from sch.implnet_sch_cuahsihisnashrwaids0 import implnet_sch_cuahsihisnashrwaids0 -from jobs.implnet_jobs_cuahsihismobilecrowdhydrologyids0 import implnet_job_cuahsihismobilecrowdhydrologyids0 -from sch.implnet_sch_cuahsihismobilecrowdhydrologyids0 import implnet_sch_cuahsihismobilecrowdhydrologyids0 -from jobs.implnet_jobs_cuahsihisandrewsforestlterids0 import implnet_job_cuahsihisandrewsforestlterids0 -from sch.implnet_sch_cuahsihisandrewsforestlterids0 import implnet_sch_cuahsihisandrewsforestlterids0 -from jobs.implnet_jobs_cuahsihisloganrivergamutids0 import implnet_job_cuahsihisloganrivergamutids0 -from sch.implnet_sch_cuahsihisloganrivergamutids0 import implnet_sch_cuahsihisloganrivergamutids0 -from jobs.implnet_jobs_cuahsihislittlebearriverids0 import implnet_job_cuahsihislittlebearriverids0 -from sch.implnet_sch_cuahsihislittlebearriverids0 import implnet_sch_cuahsihislittlebearriverids0 -from jobs.implnet_jobs_cuahsihislterntlwoodruffids0 import implnet_job_cuahsihislterntlwoodruffids0 -from sch.implnet_sch_cuahsihislterntlwoodruffids0 import implnet_sch_cuahsihislterntlwoodruffids0 -from jobs.implnet_jobs_cuahsihissagehencreekids0 import implnet_job_cuahsihissagehencreekids0 -from sch.implnet_sch_cuahsihissagehencreekids0 import implnet_sch_cuahsihissagehencreekids0 -from jobs.implnet_jobs_cuahsihisshalenetworkodmids1 import 
implnet_job_cuahsihisshalenetworkodmids1 -from sch.implnet_sch_cuahsihisshalenetworkodmids1 import implnet_sch_cuahsihisshalenetworkodmids1 -from jobs.implnet_jobs_cuahsihisfrcwqmids0 import implnet_job_cuahsihisfrcwqmids0 -from sch.implnet_sch_cuahsihisfrcwqmids0 import implnet_sch_cuahsihisfrcwqmids0 -from jobs.implnet_jobs_cuahsihishydrodataczdids0 import implnet_job_cuahsihishydrodataczdids0 -from sch.implnet_sch_cuahsihishydrodataczdids0 import implnet_sch_cuahsihishydrodataczdids0 -from jobs.implnet_jobs_cuahsihisdrwiids0 import implnet_job_cuahsihisdrwiids0 -from sch.implnet_sch_cuahsihisdrwiids0 import implnet_sch_cuahsihisdrwiids0 -from jobs.implnet_jobs_cuahsihisubwpadids0 import implnet_job_cuahsihisubwpadids0 -from sch.implnet_sch_cuahsihisubwpadids0 import implnet_sch_cuahsihisubwpadids0 -from jobs.implnet_jobs_cuahsihistrwaids0 import implnet_job_cuahsihistrwaids0 -from sch.implnet_sch_cuahsihistrwaids0 import implnet_sch_cuahsihistrwaids0 -from jobs.implnet_jobs_cuahsihisredbuttecreekgamutids0 import implnet_job_cuahsihisredbuttecreekgamutids0 -from sch.implnet_sch_cuahsihisredbuttecreekgamutids0 import implnet_sch_cuahsihisredbuttecreekgamutids0 -from jobs.implnet_jobs_cuahsihisglacialridgeids0 import implnet_job_cuahsihisglacialridgeids0 -from sch.implnet_sch_cuahsihisglacialridgeids0 import implnet_sch_cuahsihisglacialridgeids0 -from jobs.implnet_jobs_cuahsihisfcelterids0 import implnet_job_cuahsihisfcelterids0 -from sch.implnet_sch_cuahsihisfcelterids0 import implnet_sch_cuahsihisfcelterids0 -from jobs.implnet_jobs_cuahsihisczoarizids0 import implnet_job_cuahsihisczoarizids0 -from sch.implnet_sch_cuahsihisczoarizids0 import implnet_sch_cuahsihisczoarizids0 -from jobs.implnet_jobs_cuahsihiscalvinhhsids0 import implnet_job_cuahsihiscalvinhhsids0 -from sch.implnet_sch_cuahsihiscalvinhhsids0 import implnet_sch_cuahsihiscalvinhhsids0 -from jobs.implnet_jobs_cuahsihissnotelids0 import implnet_job_cuahsihissnotelids0 -from 
sch.implnet_sch_cuahsihissnotelids0 import implnet_sch_cuahsihissnotelids0 -from jobs.implnet_jobs_cuahsihisnevcanids0 import implnet_job_cuahsihisnevcanids0 -from sch.implnet_sch_cuahsihisnevcanids0 import implnet_sch_cuahsihisnevcanids0 -from jobs.implnet_jobs_cuahsihisczopsuids0 import implnet_job_cuahsihisczopsuids0 -from sch.implnet_sch_cuahsihisczopsuids0 import implnet_sch_cuahsihisczopsuids0 -from jobs.implnet_jobs_cuahsihisbrazilucbids0 import implnet_job_cuahsihisbrazilucbids0 -from sch.implnet_sch_cuahsihisbrazilucbids0 import implnet_sch_cuahsihisbrazilucbids0 -from jobs.implnet_jobs_cuahsihisgleonauburnids0 import implnet_job_cuahsihisgleonauburnids0 -from sch.implnet_sch_cuahsihisgleonauburnids0 import implnet_sch_cuahsihisgleonauburnids0 -from jobs.implnet_jobs_cuahsihislaselvastreamdischargeids0 import implnet_job_cuahsihislaselvastreamdischargeids0 -from sch.implnet_sch_cuahsihislaselvastreamdischargeids0 import implnet_sch_cuahsihislaselvastreamdischargeids0 -from jobs.implnet_jobs_cuahsihisisbenaids0 import implnet_job_cuahsihisisbenaids0 -from sch.implnet_sch_cuahsihisisbenaids0 import implnet_sch_cuahsihisisbenaids0 -from jobs.implnet_jobs_cuahsihisswedishmonitoringdataids0 import implnet_job_cuahsihisswedishmonitoringdataids0 -from sch.implnet_sch_cuahsihisswedishmonitoringdataids0 import implnet_sch_cuahsihisswedishmonitoringdataids0 -from jobs.implnet_jobs_cuahsihisunhsnowids0 import implnet_job_cuahsihisunhsnowids0 -from sch.implnet_sch_cuahsihisunhsnowids0 import implnet_sch_cuahsihisunhsnowids0 -from jobs.implnet_jobs_cuahsihishassbergeids0 import implnet_job_cuahsihishassbergeids0 -from sch.implnet_sch_cuahsihishassbergeids0 import implnet_sch_cuahsihishassbergeids0 -from jobs.implnet_jobs_cuahsihisnhgswofids0 import implnet_job_cuahsihisnhgswofids0 -from sch.implnet_sch_cuahsihisnhgswofids0 import implnet_sch_cuahsihisnhgswofids0 -from jobs.implnet_jobs_cuahsihisgonggaids0 import implnet_job_cuahsihisgonggaids0 -from 
sch.implnet_sch_cuahsihisgonggaids0 import implnet_sch_cuahsihisgonggaids0 -from jobs.implnet_jobs_cuahsihismopexids0 import implnet_job_cuahsihismopexids0 -from sch.implnet_sch_cuahsihismopexids0 import implnet_sch_cuahsihismopexids0 -from jobs.implnet_jobs_cagagespids0 import implnet_job_cagagespids0 -from sch.implnet_sch_cagagespids0 import implnet_sch_cagagespids0 -from jobs.implnet_jobs_sechydrgreg0 import implnet_job_sechydrgreg0 -from sch.implnet_sch_sechydrgreg0 import implnet_sch_sechydrgreg0 -from jobs.implnet_jobs_counties0 import implnet_job_counties0 -from sch.implnet_sch_counties0 import implnet_sch_counties0 -from jobs.implnet_jobs_pws0 import implnet_job_pws0 -from sch.implnet_sch_pws0 import implnet_sch_pws0 -from jobs.implnet_jobs_hu060 import implnet_job_hu060 -from sch.implnet_sch_hu060 import implnet_sch_hu060 -from jobs.implnet_jobs_nataq0 import implnet_job_nataq0 -from sch.implnet_sch_nataq0 import implnet_sch_nataq0 -from jobs.implnet_jobs_cbsa0 import implnet_job_cbsa0 -from sch.implnet_sch_cbsa0 import implnet_sch_cbsa0 -from jobs.implnet_jobs_hu080 import implnet_job_hu080 -from sch.implnet_sch_hu080 import implnet_sch_hu080 -from jobs.implnet_jobs_hu040 import implnet_job_hu040 -from sch.implnet_sch_hu040 import implnet_sch_hu040 -from jobs.implnet_jobs_princiaq0 import implnet_job_princiaq0 -from sch.implnet_sch_princiaq0 import implnet_sch_princiaq0 -from jobs.implnet_jobs_refgage0 import implnet_job_refgage0 -from sch.implnet_sch_refgage0 import implnet_sch_refgage0 -from jobs.implnet_jobs_refgage3 import implnet_job_refgage3 -from sch.implnet_sch_refgage3 import implnet_sch_refgage3 -from jobs.implnet_jobs_refgage2 import implnet_job_refgage2 -from sch.implnet_sch_refgage2 import implnet_sch_refgage2 -from jobs.implnet_jobs_refgage1 import implnet_job_refgage1 -from sch.implnet_sch_refgage1 import implnet_sch_refgage1 -from jobs.implnet_jobs_dams0 import implnet_job_dams0 -from sch.implnet_sch_dams0 import implnet_sch_dams0 -from 
jobs.implnet_jobs_dams1 import implnet_job_dams1 -from sch.implnet_sch_dams1 import implnet_sch_dams1 -from jobs.implnet_jobs_ua100 import implnet_job_ua100 -from sch.implnet_sch_ua100 import implnet_sch_ua100 -from jobs.implnet_jobs_states0 import implnet_job_states0 -from sch.implnet_sch_states0 import implnet_sch_states0 -from jobs.implnet_jobs_hu100 import implnet_job_hu100 -from sch.implnet_sch_hu100 import implnet_sch_hu100 -from jobs.implnet_jobs_aiannh0 import implnet_job_aiannh0 -from sch.implnet_sch_aiannh0 import implnet_sch_aiannh0 -from jobs.implnet_jobs_hu020 import implnet_job_hu020 -from sch.implnet_sch_hu020 import implnet_sch_hu020 -from jobs.implnet_jobs_mainstems0 import implnet_job_mainstems0 -from sch.implnet_sch_mainstems0 import implnet_sch_mainstems0 -from jobs.implnet_jobs_places0 import implnet_job_places0 -from sch.implnet_sch_places0 import implnet_sch_places0 -from jobs.implnet_jobs_hmw0 import implnet_job_hmw0 -from sch.implnet_sch_hmw0 import implnet_sch_hmw0 -from jobs.implnet_jobs_hmw1 import implnet_job_hmw1 -from sch.implnet_sch_hmw1 import implnet_sch_hmw1 -from jobs.implnet_jobs_huc12pp0 import implnet_job_huc12pp0 -from sch.implnet_sch_huc12pp0 import implnet_sch_huc12pp0 -from jobs.implnet_jobs_huc12pp1 import implnet_job_huc12pp1 -from sch.implnet_sch_huc12pp1 import implnet_sch_huc12pp1 -from jobs.implnet_jobs_nmwdiose3 import implnet_job_nmwdiose3 -from sch.implnet_sch_nmwdiose3 import implnet_sch_nmwdiose3 -from jobs.implnet_jobs_nmwdiose2 import implnet_job_nmwdiose2 -from sch.implnet_sch_nmwdiose2 import implnet_sch_nmwdiose2 -from jobs.implnet_jobs_nmwdiose0 import implnet_job_nmwdiose0 -from sch.implnet_sch_nmwdiose0 import implnet_sch_nmwdiose0 -from jobs.implnet_jobs_nmwdiose4 import implnet_job_nmwdiose4 -from sch.implnet_sch_nmwdiose4 import implnet_sch_nmwdiose4 -from jobs.implnet_jobs_nmwdiose1 import implnet_job_nmwdiose1 -from sch.implnet_sch_nmwdiose1 import implnet_sch_nmwdiose1 -from 
jobs.implnet_jobs_nmwdist0 import implnet_job_nmwdist0 -from sch.implnet_sch_nmwdist0 import implnet_sch_nmwdist0 -from jobs.implnet_jobs_selfieids0 import implnet_job_selfieids0 -from sch.implnet_sch_selfieids0 import implnet_sch_selfieids0 -from jobs.implnet_jobs_chyldpilotids0 import implnet_job_chyldpilotids0 -from sch.implnet_sch_chyldpilotids0 import implnet_sch_chyldpilotids0 -from jobs.implnet_jobs_rise0 import implnet_job_rise0 -from sch.implnet_sch_rise0 import implnet_sch_rise0 -from jobs.implnet_jobs_autotest10 import implnet_job_autotest10 -from sch.implnet_sch_autotest10 import implnet_sch_autotest10 -from jobs.implnet_jobs_links0 import implnet_job_links0 -from sch.implnet_sch_links0 import implnet_sch_links0 -from jobs.implnet_jobs_demo0 import implnet_job_demo0 -from sch.implnet_sch_demo0 import implnet_sch_demo0 -from jobs.implnet_jobs_autotest20 import implnet_job_autotest20 -from sch.implnet_sch_autotest20 import implnet_sch_autotest20 -from jobs.implnet_jobs_wade2 import implnet_job_wade2 -from sch.implnet_sch_wade2 import implnet_sch_wade2 -from jobs.implnet_jobs_wade0 import implnet_job_wade0 -from sch.implnet_sch_wade0 import implnet_sch_wade0 -from jobs.implnet_jobs_wade17 import implnet_job_wade17 -from sch.implnet_sch_wade17 import implnet_sch_wade17 -from jobs.implnet_jobs_wade9 import implnet_job_wade9 -from sch.implnet_sch_wade9 import implnet_sch_wade9 -from jobs.implnet_jobs_wade7 import implnet_job_wade7 -from sch.implnet_sch_wade7 import implnet_sch_wade7 -from jobs.implnet_jobs_wade3 import implnet_job_wade3 -from sch.implnet_sch_wade3 import implnet_sch_wade3 -from jobs.implnet_jobs_wade15 import implnet_job_wade15 -from sch.implnet_sch_wade15 import implnet_sch_wade15 -from jobs.implnet_jobs_wade5 import implnet_job_wade5 -from sch.implnet_sch_wade5 import implnet_sch_wade5 -from jobs.implnet_jobs_wade10 import implnet_job_wade10 -from sch.implnet_sch_wade10 import implnet_sch_wade10 -from jobs.implnet_jobs_wade14 import 
implnet_job_wade14 -from sch.implnet_sch_wade14 import implnet_sch_wade14 -from jobs.implnet_jobs_wade18 import implnet_job_wade18 -from sch.implnet_sch_wade18 import implnet_sch_wade18 -from jobs.implnet_jobs_wade13 import implnet_job_wade13 -from sch.implnet_sch_wade13 import implnet_sch_wade13 -from jobs.implnet_jobs_wade8 import implnet_job_wade8 -from sch.implnet_sch_wade8 import implnet_sch_wade8 -from jobs.implnet_jobs_wade19 import implnet_job_wade19 -from sch.implnet_sch_wade19 import implnet_sch_wade19 -from jobs.implnet_jobs_wade12 import implnet_job_wade12 -from sch.implnet_sch_wade12 import implnet_sch_wade12 -from jobs.implnet_jobs_wade4 import implnet_job_wade4 -from sch.implnet_sch_wade4 import implnet_sch_wade4 -from jobs.implnet_jobs_wade16 import implnet_job_wade16 -from sch.implnet_sch_wade16 import implnet_sch_wade16 -from jobs.implnet_jobs_wade1 import implnet_job_wade1 -from sch.implnet_sch_wade1 import implnet_sch_wade1 -from jobs.implnet_jobs_wade6 import implnet_job_wade6 -from sch.implnet_sch_wade6 import implnet_sch_wade6 -from jobs.implnet_jobs_wade11 import implnet_job_wade11 -from sch.implnet_sch_wade11 import implnet_sch_wade11 - -@repository -def gleaner(): - jobs = [implnet_job_nwisgw20, implnet_job_nwisgw22, implnet_job_nwisgw16, implnet_job_nwisgw12, implnet_job_nwisgw25, implnet_job_nwisgw14, implnet_job_nwisgw23, implnet_job_nwisgw10, implnet_job_nwisgw15, implnet_job_nwisgw2, implnet_job_nwisgw24, implnet_job_nwisgw9, implnet_job_nwisgw19, implnet_job_nwisgw28, implnet_job_nwisgw26, implnet_job_nwisgw5, implnet_job_nwisgw13, implnet_job_nwisgw6, implnet_job_nwisgw3, implnet_job_nwisgw4, implnet_job_nwisgw1, implnet_job_nwisgw21, implnet_job_nwisgw27, implnet_job_nwisgw8, implnet_job_nwisgw17, implnet_job_nwisgw18, implnet_job_nwisgw7, implnet_job_nwisgw11, implnet_job_nwisgw0, implnet_job_nwissite1, implnet_job_nwissite3, implnet_job_nwissite0, implnet_job_nwissite2, implnet_job_gfv11pois1, implnet_job_gfv11pois0, 
implnet_job_hydrologicunit0, implnet_job_damspids0, implnet_job_cuahsihishydrodataczhrids0, implnet_job_cuahsihisnooksackmicroclimatenetworkids0, implnet_job_cuahsihisneonids0, implnet_job_cuahsihisglobalriversobservatoryids0, implnet_job_cuahsihistncwaterdataids0, implnet_job_cuahsihisscotlandnwisids0, implnet_job_cuahsihisczoboulderids0, implnet_job_cuahsihisyosemitehydroclimatenetworkids0, implnet_job_cuahsihismuddyriverids0, implnet_job_cuahsihisczomercedids0, implnet_job_cuahsihisghcnids0, implnet_job_cuahsihismmaatacamaids0, implnet_job_cuahsihisumbcwqids0, implnet_job_cuahsihisgleonlakeannieids0, implnet_job_cuahsihisluwlids0, implnet_job_cuahsihiscedarriverids0, implnet_job_cuahsihisccbepdapids0, implnet_job_cuahsihiskansasweatherdataids0, implnet_job_cuahsihisodmkentstateids0, implnet_job_cuahsihisgleondorsetids0, implnet_job_cuahsihisclarksburgspids0, implnet_job_cuahsihiscrwaids0, implnet_job_cuahsihiscuisoids0, implnet_job_cuahsihisprovorivergamutids0, implnet_job_cuahsihisirwaids0, implnet_job_cuahsihisczoluquilloids0, implnet_job_cuahsihistuolumnemdwids0, implnet_job_cuahsihisrmblids0, implnet_job_cuahsihispanolaodmids0, implnet_job_cuahsihisnewnids0, implnet_job_cuahsihisczoudelids0, implnet_job_cuahsihisfarmrwaids0, implnet_job_cuahsihisskcmilltownids0, implnet_job_cuahsihisumbcgwids0, implnet_job_cuahsihisshalenetworkodmids0, implnet_job_cuahsihisnevadosids0, implnet_job_cuahsihisweiherbachids0, implnet_job_cuahsihismazarriverprojectids0, implnet_job_cuahsihisgleonsunapeeids0, implnet_job_cuahsihisorsancohabids0, implnet_job_cuahsihismwraids0, implnet_job_cuahsihismaaeriids0, implnet_job_cuahsihisnceiww2ids0, implnet_job_cuahsihistarlandwaterqualityids0, implnet_job_cuahsihislczoodm2ids0, implnet_job_cuahsihiscocorahsids0, implnet_job_cuahsihisparalanaturalezaids0, implnet_job_cuahsihisczocatalinaids0, implnet_job_cuahsihisieeratwilkesuniversityids0, implnet_job_cuahsihismudlakeids0, implnet_job_cuahsihismwdisids0, 
implnet_job_cuahsihisloganriverids0, implnet_job_cuahsihisscanids0, implnet_job_cuahsihisnashrwaids0, implnet_job_cuahsihismobilecrowdhydrologyids0, implnet_job_cuahsihisandrewsforestlterids0, implnet_job_cuahsihisloganrivergamutids0, implnet_job_cuahsihislittlebearriverids0, implnet_job_cuahsihislterntlwoodruffids0, implnet_job_cuahsihissagehencreekids0, implnet_job_cuahsihisshalenetworkodmids1, implnet_job_cuahsihisfrcwqmids0, implnet_job_cuahsihishydrodataczdids0, implnet_job_cuahsihisdrwiids0, implnet_job_cuahsihisubwpadids0, implnet_job_cuahsihistrwaids0, implnet_job_cuahsihisredbuttecreekgamutids0, implnet_job_cuahsihisglacialridgeids0, implnet_job_cuahsihisfcelterids0, implnet_job_cuahsihisczoarizids0, implnet_job_cuahsihiscalvinhhsids0, implnet_job_cuahsihissnotelids0, implnet_job_cuahsihisnevcanids0, implnet_job_cuahsihisczopsuids0, implnet_job_cuahsihisbrazilucbids0, implnet_job_cuahsihisgleonauburnids0, implnet_job_cuahsihislaselvastreamdischargeids0, implnet_job_cuahsihisisbenaids0, implnet_job_cuahsihisswedishmonitoringdataids0, implnet_job_cuahsihisunhsnowids0, implnet_job_cuahsihishassbergeids0, implnet_job_cuahsihisnhgswofids0, implnet_job_cuahsihisgonggaids0, implnet_job_cuahsihismopexids0, implnet_job_cagagespids0, implnet_job_sechydrgreg0, implnet_job_counties0, implnet_job_pws0, implnet_job_hu060, implnet_job_nataq0, implnet_job_cbsa0, implnet_job_hu080, implnet_job_hu040, implnet_job_princiaq0, implnet_job_refgage0, implnet_job_refgage3, implnet_job_refgage2, implnet_job_refgage1, implnet_job_dams0, implnet_job_dams1, implnet_job_ua100, implnet_job_states0, implnet_job_hu100, implnet_job_aiannh0, implnet_job_hu020, implnet_job_mainstems0, implnet_job_places0, implnet_job_hmw0, implnet_job_hmw1, implnet_job_huc12pp0, implnet_job_huc12pp1, implnet_job_nmwdiose3, implnet_job_nmwdiose2, implnet_job_nmwdiose0, implnet_job_nmwdiose4, implnet_job_nmwdiose1, implnet_job_nmwdist0, implnet_job_selfieids0, implnet_job_chyldpilotids0, implnet_job_rise0, 
implnet_job_autotest10, implnet_job_links0, implnet_job_demo0, implnet_job_autotest20, implnet_job_wade2, implnet_job_wade0, implnet_job_wade17, implnet_job_wade9, implnet_job_wade7, implnet_job_wade3, implnet_job_wade15, implnet_job_wade5, implnet_job_wade10, implnet_job_wade14, implnet_job_wade18, implnet_job_wade13, implnet_job_wade8, implnet_job_wade19, implnet_job_wade12, implnet_job_wade4, implnet_job_wade16, implnet_job_wade1, implnet_job_wade6, implnet_job_wade11] - schedules = [implnet_sch_nwisgw20, implnet_sch_nwisgw22, implnet_sch_nwisgw16, implnet_sch_nwisgw12, implnet_sch_nwisgw25, implnet_sch_nwisgw14, implnet_sch_nwisgw23, implnet_sch_nwisgw10, implnet_sch_nwisgw15, implnet_sch_nwisgw2, implnet_sch_nwisgw24, implnet_sch_nwisgw9, implnet_sch_nwisgw19, implnet_sch_nwisgw28, implnet_sch_nwisgw26, implnet_sch_nwisgw5, implnet_sch_nwisgw13, implnet_sch_nwisgw6, implnet_sch_nwisgw3, implnet_sch_nwisgw4, implnet_sch_nwisgw1, implnet_sch_nwisgw21, implnet_sch_nwisgw27, implnet_sch_nwisgw8, implnet_sch_nwisgw17, implnet_sch_nwisgw18, implnet_sch_nwisgw7, implnet_sch_nwisgw11, implnet_sch_nwisgw0, implnet_sch_nwissite1, implnet_sch_nwissite3, implnet_sch_nwissite0, implnet_sch_nwissite2, implnet_sch_gfv11pois1, implnet_sch_gfv11pois0, implnet_sch_hydrologicunit0, implnet_sch_damspids0, implnet_sch_cuahsihishydrodataczhrids0, implnet_sch_cuahsihisnooksackmicroclimatenetworkids0, implnet_sch_cuahsihisneonids0, implnet_sch_cuahsihisglobalriversobservatoryids0, implnet_sch_cuahsihistncwaterdataids0, implnet_sch_cuahsihisscotlandnwisids0, implnet_sch_cuahsihisczoboulderids0, implnet_sch_cuahsihisyosemitehydroclimatenetworkids0, implnet_sch_cuahsihismuddyriverids0, implnet_sch_cuahsihisczomercedids0, implnet_sch_cuahsihisghcnids0, implnet_sch_cuahsihismmaatacamaids0, implnet_sch_cuahsihisumbcwqids0, implnet_sch_cuahsihisgleonlakeannieids0, implnet_sch_cuahsihisluwlids0, implnet_sch_cuahsihiscedarriverids0, implnet_sch_cuahsihisccbepdapids0, 
implnet_sch_cuahsihiskansasweatherdataids0, implnet_sch_cuahsihisodmkentstateids0, implnet_sch_cuahsihisgleondorsetids0, implnet_sch_cuahsihisclarksburgspids0, implnet_sch_cuahsihiscrwaids0, implnet_sch_cuahsihiscuisoids0, implnet_sch_cuahsihisprovorivergamutids0, implnet_sch_cuahsihisirwaids0, implnet_sch_cuahsihisczoluquilloids0, implnet_sch_cuahsihistuolumnemdwids0, implnet_sch_cuahsihisrmblids0, implnet_sch_cuahsihispanolaodmids0, implnet_sch_cuahsihisnewnids0, implnet_sch_cuahsihisczoudelids0, implnet_sch_cuahsihisfarmrwaids0, implnet_sch_cuahsihisskcmilltownids0, implnet_sch_cuahsihisumbcgwids0, implnet_sch_cuahsihisshalenetworkodmids0, implnet_sch_cuahsihisnevadosids0, implnet_sch_cuahsihisweiherbachids0, implnet_sch_cuahsihismazarriverprojectids0, implnet_sch_cuahsihisgleonsunapeeids0, implnet_sch_cuahsihisorsancohabids0, implnet_sch_cuahsihismwraids0, implnet_sch_cuahsihismaaeriids0, implnet_sch_cuahsihisnceiww2ids0, implnet_sch_cuahsihistarlandwaterqualityids0, implnet_sch_cuahsihislczoodm2ids0, implnet_sch_cuahsihiscocorahsids0, implnet_sch_cuahsihisparalanaturalezaids0, implnet_sch_cuahsihisczocatalinaids0, implnet_sch_cuahsihisieeratwilkesuniversityids0, implnet_sch_cuahsihismudlakeids0, implnet_sch_cuahsihismwdisids0, implnet_sch_cuahsihisloganriverids0, implnet_sch_cuahsihisscanids0, implnet_sch_cuahsihisnashrwaids0, implnet_sch_cuahsihismobilecrowdhydrologyids0, implnet_sch_cuahsihisandrewsforestlterids0, implnet_sch_cuahsihisloganrivergamutids0, implnet_sch_cuahsihislittlebearriverids0, implnet_sch_cuahsihislterntlwoodruffids0, implnet_sch_cuahsihissagehencreekids0, implnet_sch_cuahsihisshalenetworkodmids1, implnet_sch_cuahsihisfrcwqmids0, implnet_sch_cuahsihishydrodataczdids0, implnet_sch_cuahsihisdrwiids0, implnet_sch_cuahsihisubwpadids0, implnet_sch_cuahsihistrwaids0, implnet_sch_cuahsihisredbuttecreekgamutids0, implnet_sch_cuahsihisglacialridgeids0, implnet_sch_cuahsihisfcelterids0, implnet_sch_cuahsihisczoarizids0, 
implnet_sch_cuahsihiscalvinhhsids0, implnet_sch_cuahsihissnotelids0, implnet_sch_cuahsihisnevcanids0, implnet_sch_cuahsihisczopsuids0, implnet_sch_cuahsihisbrazilucbids0, implnet_sch_cuahsihisgleonauburnids0, implnet_sch_cuahsihislaselvastreamdischargeids0, implnet_sch_cuahsihisisbenaids0, implnet_sch_cuahsihisswedishmonitoringdataids0, implnet_sch_cuahsihisunhsnowids0, implnet_sch_cuahsihishassbergeids0, implnet_sch_cuahsihisnhgswofids0, implnet_sch_cuahsihisgonggaids0, implnet_sch_cuahsihismopexids0, implnet_sch_cagagespids0, implnet_sch_sechydrgreg0, implnet_sch_counties0, implnet_sch_pws0, implnet_sch_hu060, implnet_sch_nataq0, implnet_sch_cbsa0, implnet_sch_hu080, implnet_sch_hu040, implnet_sch_princiaq0, implnet_sch_refgage0, implnet_sch_refgage3, implnet_sch_refgage2, implnet_sch_refgage1, implnet_sch_dams0, implnet_sch_dams1, implnet_sch_ua100, implnet_sch_states0, implnet_sch_hu100, implnet_sch_aiannh0, implnet_sch_hu020, implnet_sch_mainstems0, implnet_sch_places0, implnet_sch_hmw0, implnet_sch_hmw1, implnet_sch_huc12pp0, implnet_sch_huc12pp1, implnet_sch_nmwdiose3, implnet_sch_nmwdiose2, implnet_sch_nmwdiose0, implnet_sch_nmwdiose4, implnet_sch_nmwdiose1, implnet_sch_nmwdist0, implnet_sch_selfieids0, implnet_sch_chyldpilotids0, implnet_sch_rise0, implnet_sch_autotest10, implnet_sch_links0, implnet_sch_demo0, implnet_sch_autotest20, implnet_sch_wade2, implnet_sch_wade0, implnet_sch_wade17, implnet_sch_wade9, implnet_sch_wade7, implnet_sch_wade3, implnet_sch_wade15, implnet_sch_wade5, implnet_sch_wade10, implnet_sch_wade14, implnet_sch_wade18, implnet_sch_wade13, implnet_sch_wade8, implnet_sch_wade19, implnet_sch_wade12, implnet_sch_wade4, implnet_sch_wade16, implnet_sch_wade1, implnet_sch_wade6, implnet_sch_wade11] - - - return jobs + schedules diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_africaioc.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_africaioc.py deleted file mode 100644 
index 833bcffe..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_africaioc.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_africaioc import implnet_job_africaioc - -@schedule(cron_schedule="0 0 * * 0", job=implnet_job_africaioc, execution_timezone="US/Central") -def implnet_sch_africaioc(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_aquadocs.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_aquadocs.py deleted file mode 100644 index 7ffe1dc9..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_aquadocs.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_aquadocs import implnet_job_aquadocs - -@schedule(cron_schedule="0 6 * * 0", job=implnet_job_aquadocs, execution_timezone="US/Central") -def implnet_sch_aquadocs(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_benguelacc.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_benguelacc.py deleted file mode 100644 index 951c20fc..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_benguelacc.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_benguelacc import implnet_job_benguelacc - -@schedule(cron_schedule="0 12 * * 0", job=implnet_job_benguelacc, execution_timezone="US/Central") -def implnet_sch_benguelacc(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_caribbeanmarineatlas.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_caribbeanmarineatlas.py deleted file mode 100644 index 74a73d41..00000000 --- 
a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_caribbeanmarineatlas.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_caribbeanmarineatlas import implnet_job_caribbeanmarineatlas - -@schedule(cron_schedule="0 18 * * 0", job=implnet_job_caribbeanmarineatlas, execution_timezone="US/Central") -def implnet_sch_caribbeanmarineatlas(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_cioos.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_cioos.py deleted file mode 100644 index bd0c283e..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_cioos.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_cioos import implnet_job_cioos - -@schedule(cron_schedule="0 0 * * 1", job=implnet_job_cioos, execution_timezone="US/Central") -def implnet_sch_cioos(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_edmerp.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_edmerp.py deleted file mode 100644 index c0d3ccc8..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_edmerp.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_edmerp import implnet_job_edmerp - -@schedule(cron_schedule="0 6 * * 1", job=implnet_job_edmerp, execution_timezone="US/Central") -def implnet_sch_edmerp(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_edmo.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_edmo.py deleted file mode 100644 index 9082898e..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_edmo.py +++ /dev/null @@ -1,8 +0,0 @@ -from 
dagster import schedule - -from jobs.implnet_jobs_edmo import implnet_job_edmo - -@schedule(cron_schedule="0 12 * * 1", job=implnet_job_edmo, execution_timezone="US/Central") -def implnet_sch_edmo(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_emodnet.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_emodnet.py deleted file mode 100644 index 3f4352d8..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_emodnet.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_emodnet import implnet_job_emodnet - -@schedule(cron_schedule="0 12 * * 3", job=implnet_job_emodnet, execution_timezone="US/Central") -def implnet_sch_emodnet(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanevents.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanevents.py deleted file mode 100644 index 45642ea0..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanevents.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_euroceanevents import implnet_job_euroceanevents - -@schedule(cron_schedule="0 18 * * 1", job=implnet_job_euroceanevents, execution_timezone="US/Central") -def implnet_sch_euroceanevents(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanexperts.py deleted file mode 100644 index 61dfa79a..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanexperts.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_euroceanexperts import 
implnet_job_euroceanexperts - -@schedule(cron_schedule="0 0 * * 2", job=implnet_job_euroceanexperts, execution_timezone="US/Central") -def implnet_sch_euroceanexperts(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceaninstitutions.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceaninstitutions.py deleted file mode 100644 index 79a2d707..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceaninstitutions.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_euroceaninstitutions import implnet_job_euroceaninstitutions - -@schedule(cron_schedule="0 6 * * 2", job=implnet_job_euroceaninstitutions, execution_timezone="US/Central") -def implnet_sch_euroceaninstitutions(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanorgs.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanorgs.py deleted file mode 100644 index 07a5fad9..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanorgs.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_euroceanorgs import implnet_job_euroceanorgs - -@schedule(cron_schedule="0 12 * * 2", job=implnet_job_euroceanorgs, execution_timezone="US/Central") -def implnet_sch_euroceanorgs(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanprojects.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanprojects.py deleted file mode 100644 index fc55bf23..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanprojects.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from 
jobs.implnet_jobs_euroceanprojects import implnet_job_euroceanprojects - -@schedule(cron_schedule="0 18 * * 2", job=implnet_job_euroceanprojects, execution_timezone="US/Central") -def implnet_sch_euroceanprojects(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceantraining.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceantraining.py deleted file mode 100644 index 739929ce..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceantraining.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_euroceantraining import implnet_job_euroceantraining - -@schedule(cron_schedule="0 0 * * 3", job=implnet_job_euroceantraining, execution_timezone="US/Central") -def implnet_sch_euroceantraining(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanvessels.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanvessels.py deleted file mode 100644 index a72e6218..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_euroceanvessels.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_euroceanvessels import implnet_job_euroceanvessels - -@schedule(cron_schedule="0 6 * * 3", job=implnet_job_euroceanvessels, execution_timezone="US/Central") -def implnet_sch_euroceanvessels(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_inanodc.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_inanodc.py deleted file mode 100644 index cf85b20f..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_inanodc.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from 
jobs.implnet_jobs_inanodc import implnet_job_inanodc - -@schedule(cron_schedule="0 18 * * 3", job=implnet_job_inanodc, execution_timezone="US/Central") -def implnet_sch_inanodc(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemardocuments.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemardocuments.py deleted file mode 100644 index 03fc837a..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemardocuments.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_invemardocuments import implnet_job_invemardocuments - -@schedule(cron_schedule="0 0 * * 4", job=implnet_job_invemardocuments, execution_timezone="US/Central") -def implnet_sch_invemardocuments(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarexperts.py deleted file mode 100644 index 08f65c09..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarexperts.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_invemarexperts import implnet_job_invemarexperts - -@schedule(cron_schedule="0 6 * * 4", job=implnet_job_invemarexperts, execution_timezone="US/Central") -def implnet_sch_invemarexperts(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarinstitutions.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarinstitutions.py deleted file mode 100644 index 43070d73..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarinstitutions.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from 
jobs.implnet_jobs_invemarinstitutions import implnet_job_invemarinstitutions - -@schedule(cron_schedule="0 12 * * 4", job=implnet_job_invemarinstitutions, execution_timezone="US/Central") -def implnet_sch_invemarinstitutions(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemartraining.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemartraining.py deleted file mode 100644 index 24ef2740..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemartraining.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_invemartraining import implnet_job_invemartraining - -@schedule(cron_schedule="0 18 * * 4", job=implnet_job_invemartraining, execution_timezone="US/Central") -def implnet_sch_invemartraining(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarvessels.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarvessels.py deleted file mode 100644 index 6baaab09..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_invemarvessels.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_invemarvessels import implnet_job_invemarvessels - -@schedule(cron_schedule="0 0 * * 5", job=implnet_job_invemarvessels, execution_timezone="US/Central") -def implnet_sch_invemarvessels(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_marinetraining.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_marinetraining.py deleted file mode 100644 index ae234b6f..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_marinetraining.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import 
schedule - -from jobs.implnet_jobs_marinetraining import implnet_job_marinetraining - -@schedule(cron_schedule="0 6 * * 5", job=implnet_job_marinetraining, execution_timezone="US/Central") -def implnet_sch_marinetraining(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_maspawio.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_maspawio.py deleted file mode 100644 index b6f09ed9..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_maspawio.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_maspawio import implnet_job_maspawio - -@schedule(cron_schedule="0 12 * * 5", job=implnet_job_maspawio, execution_timezone="US/Central") -def implnet_sch_maspawio(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_obis.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_obis.py deleted file mode 100644 index b8116bee..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_obis.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_obis import implnet_job_obis - -@schedule(cron_schedule="0 18 * * 5", job=implnet_job_obis, execution_timezone="US/Central") -def implnet_sch_obis(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_obps.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_obps.py deleted file mode 100644 index 8b3ff4f5..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_obps.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_obps import implnet_job_obps - -@schedule(cron_schedule="0 0 * * 6", job=implnet_job_obps, 
execution_timezone="US/Central") -def implnet_sch_obps(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_oceanexperts.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_oceanexperts.py deleted file mode 100644 index db84bdfb..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_oceanexperts.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_oceanexperts import implnet_job_oceanexperts - -@schedule(cron_schedule="0 6 * * 6", job=implnet_job_oceanexperts, execution_timezone="US/Central") -def implnet_sch_oceanexperts(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_oceanscape.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_oceanscape.py deleted file mode 100644 index 4d2438bf..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_oceanscape.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_oceanscape import implnet_job_oceanscape - -@schedule(cron_schedule="0 12 * * 6", job=implnet_job_oceanscape, execution_timezone="US/Central") -def implnet_sch_oceanscape(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_pdh.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_pdh.py deleted file mode 100644 index 9920619d..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_pdh.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_pdh import implnet_job_pdh - -@schedule(cron_schedule="0 18 * * 6", job=implnet_job_pdh, execution_timezone="US/Central") -def implnet_sch_pdh(_context): - run_config = {} - return run_config diff --git 
a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_pogo.py b/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_pogo.py deleted file mode 100644 index d499c042..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/schedules/implnet_sch_pogo.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_pogo import implnet_job_pogo - -@schedule(cron_schedule="0 0 * * 0", job=implnet_job_pogo, execution_timezone="US/Central") -def implnet_sch_pogo(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/generatedCode/implnet-oih/output/workspace.yaml b/dagster/implnets/generatedCode/implnet-oih/output/workspace.yaml deleted file mode 100644 index 54490e1d..00000000 --- a/dagster/implnets/generatedCode/implnet-oih/output/workspace.yaml +++ /dev/null @@ -1,4 +0,0 @@ -load_from: - - python_file: - relative_path: "repositories/repository.py" - working_directory: . \ No newline at end of file diff --git a/dagster/implnets/pygen.py b/dagster/implnets/pygen.py index ae8bc80a..fa61442c 100644 --- a/dagster/implnets/pygen.py +++ b/dagster/implnets/pygen.py @@ -6,7 +6,7 @@ import fileinput import re import shutil - +import pydash # python pygen.py -cf ../../configs/oih/gleanerconfig.yaml -od ./output -td ./templates @@ -24,8 +24,9 @@ def gencode(cf, od, td, days) -> str: inc = round(hours / len(c["sources"])) # divide hours we want to run over by number of source to get increment print("index event every {} hours over {} day(s) period for {} items".format(inc, days, len(c["sources"]))) - - for i, s in enumerate(c["sources"]): + sources = pydash.union_by(c["sources"], lambda source: source["name"]) + #for i, s in enumerate(c["sources"]): + for i, s in enumerate(sources): # could put an if statement here for those that are active # print(s["name"]) diff --git a/dagster/implnets/requirements.txt b/dagster/implnets/requirements.txt index e7a319c9..bf6e8ff1 100644 --- 
a/dagster/implnets/requirements.txt +++ b/dagster/implnets/requirements.txt @@ -1,17 +1,25 @@ -dagit>=1.4.2 -dagster-postgres==0.20.2 -dagster>=1.4.2 -dagster-webserver>=1.4.2 -dagster-docker -dagster-aws +dagit>=1.7.10 +dagster-postgres>=0.23.10 +dagster>=1.7.10 +dagster-graphql>=1.7.10 +dagster-webserver>=1.7.10 +dagster-docker>=0.23.10 +dagster-aws>=0.23.10 +dagster_slack>=0.23.10 ipython-genutils==0.2.0 advertools==0.13.2 minio==7.1.13 docker>=6.1.0 +dagstermill>=0.23.10 +notebook +pydash +pyyaml +orjson + +#earthcube-utilities>=0.1.26 +earthcube-utilities @ git+https://github.com/earthcube/earthcube_utilities@dev#egg=earthcube_utilities&subdirectory=earthcube_utilities -earthcube-utilities>=0.1.18 -#earthcube-utilities @ git+https://github.com/earthcube/earthcube_utilities@b671efb#subdirectory=earthcube_utilities # if we want to use an non-released branch 2c1dcab is the commit # earthcube-utilities @ git+https://github.com/earthcube/earthcube_utilities@2c1dcab#subdirectory=earthcube_utilities diff --git a/dagster/implnets/requirements_code.txt b/dagster/implnets/requirements_code.txt deleted file mode 100644 index 445ea1e9..00000000 --- a/dagster/implnets/requirements_code.txt +++ /dev/null @@ -1,22 +0,0 @@ - -# this includes files that are used in the code server, -# eg requirements of a code run on dagster -# needs the dagster code for the dagster api grpc call, -# but do not think we need the webserver/dagit -#dagit>=1.4.2 -dagster-postgres>=0.20.2 -dagster>=1.4.2 -#dagster-webserver>=1.4.2 -dagster-docker -dagster-aws -ipython-genutils==0.2.0 -advertools==0.13.2 -minio==7.1.13 -docker>=6.1.0 - -earthcube-utilities>=0.1.18 -#earthcube-utilities @ git+https://github.com/earthcube/earthcube_utilities@b671efb#subdirectory=earthcube_utilities -# if we want to use an non-released branch 2c1dcab is the commit -# earthcube-utilities @ git+https://github.com/earthcube/earthcube_utilities@2c1dcab#subdirectory=earthcube_utilities - - diff --git 
a/dagster/implnets/requirements_dagster.txt b/dagster/implnets/requirements_dagster.txt index 1629762e..e21687a5 100644 --- a/dagster/implnets/requirements_dagster.txt +++ b/dagster/implnets/requirements_dagster.txt @@ -1,10 +1,11 @@ - -dagit>=1.4.2 -dagster-postgres>=0.20.2 -dagster>=1.4.2 -dagster-webserver>=1.4.2 -dagster-docker -dagster-aws +dagit>=1.7.7 +dagster-postgres>=0.23.7 +dagster>=1.7.7 +dagster-webserver>=1.7.7 +dagster-docker>=0.23.7 +dagster-aws>=0.23.7 +dagstermill>=0.23.7 +dagster_slack>=0.23.7 ### # this are modules that are not part of dagster base image # ipython-genutils==0.2.0 @@ -12,7 +13,7 @@ dagster-aws # minio==7.1.13 # docker>=6.1.0 # -# earthcube-utilities>=0.1.17 +# earthcube-utilities>=0.1.25 diff --git a/dagster/implnets/templates/v1/implnet_ops_SOURCEVAL.py b/dagster/implnets/templates/v1/implnet_ops_SOURCEVAL.py index 9618feef..d4a335e9 100644 --- a/dagster/implnets/templates/v1/implnet_ops_SOURCEVAL.py +++ b/dagster/implnets/templates/v1/implnet_ops_SOURCEVAL.py @@ -1,7 +1,9 @@ +import csv import distutils import logging import time +import pandas from dagster import job, op, graph,In, Nothing, get_dagster_logger import os, json, io import urllib @@ -12,10 +14,12 @@ from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace import json +from ec.graph.release_graph import ReleaseGraph from minio import Minio from minio.error import S3Error from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo +from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo, \ + generateGraphReportsRelease from ec.datastore import s3 from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset from ec.graph.manageGraph import ManageBlazegraph as mg @@ -36,19 +40,19 @@ from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image from 
docker.types.services import ContainerSpec, TaskTemplate, ConfigReference -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') +DEBUG=(os.getenv('DEBUG_CONTAINER', 'False').lower() == 'true') # # # path to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) # WHEN RUNNING dagster-dev, this needs to be a path to a local file ## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") +DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('GLEANERIO_DAGSTER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") # Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") +GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_DOCKER_HEADLESS_NETWORK', "headless_gleanerio") # env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - +URL = os.environ.get('GLEANERIO_DOCKER_URL') +APIKEY = os.environ.get('GLEANERIO_PORTAINER_APIKEY') +CONTAINER_WAIT_TIMEOUT= int( os.environ.get('GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT',300)) GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) @@ -77,10 +81,11 @@ GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) +GLEANERIO_DOCKER_GLEANER_CONFIG=str(os.environ.get('GLEANERIO_DOCKER_GLEANER_CONFIG', 'gleaner')) +GLEANERIO_DOCKER_NABU_CONFIG=str(os.environ.get('GLEANERIO_DOCKER_NABU_CONFIG', 'nabu')) #GLEANERIO_SUMMARY_GRAPH_ENDPOINT = 
os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) +GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_GRAPH_SUMMARY_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) +GLEANERIO_SUMMARIZE_GRAPH=(os.getenv('GLEANERIO_GRAPH_SUMMARIZE', 'False').lower() == 'true') SUMMARY_PATH = 'graphs/summary' RELEASE_PATH = 'graphs/latest' @@ -138,7 +143,7 @@ def s3reader(object): get_dagster_logger().info(f"S3 read error : {str(err)}") -def s3loader(data, name): +def s3_log_uploader(data, name, date_string=datetime.now().strftime("%Y_%m_%d_%H_%M_%S")): secure= GLEANER_MINIO_USE_SSL server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) @@ -158,8 +163,8 @@ def s3loader(data, name): # else: # print("Bucket 'X' already exists") - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") + # now = datetime.now() + # date_string = now.strftime("%Y_%m_%d_%H_%M_%S") logname = name + '_{}.log'.format(date_string) objPrefix = GLEANERIO_LOG_PREFIX + logname @@ -167,24 +172,38 @@ def s3loader(data, name): #length = f.write(bytes(json_str, 'utf-8')) length = f.write(data) f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) + try: + client.put_object(GLEANER_MINIO_BUCKET, + objPrefix, + f, #io.BytesIO(data), + length, #len(data), + content_type="text/plain" + ) + get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") + except Exception as ex: + get_dagster_logger().error(f"Log uploaded failed: {str(objPrefix)}") 
+ +def _releaseUrl( source, path=RELEASE_PATH, extension="nq"): proto = "http" - if GLEANER_MINIO_USE_SSL: proto = "https" - port = GLEANER_MINIO_PORT address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) bucket = GLEANER_MINIO_BUCKET release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" + return release_url + +def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_graphEndpoint()): + # revision of EC utilities, will have a insertFromURL + #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) + # proto = "http" + # + # if GLEANER_MINIO_USE_SSL: + # proto = "https" + # port = GLEANER_MINIO_PORT + # address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) + # bucket = GLEANER_MINIO_BUCKET + # release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" + # BLAZEGRAPH SPECIFIC # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" # get_dagster_logger().info(f'graph: insert "{source}" to {url} ') @@ -201,6 +220,7 @@ def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_grap # get_dagster_logger().info(f'graph: error') # raise Exception(f' graph: insert failed: status:{r.status_code}') + release_url = _releaseUrl(source, path, extension) ### GENERIC LOAD FROM url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" get_dagster_logger().info(f'graph: insert "{source}" to {url} ') @@ -263,13 +283,13 @@ def _create_service( serivce_mode = ServiceMode("replicated-job",concurrency=1,replicas=1) get_dagster_logger().info(str(client.configs.list())) # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}}) - gleanerconfig = client.configs.list(filters={"name": 
[GLEANERIO_GLEANER_DOCKER_CONFIG]}) + gleanerconfig = client.configs.list(filters={"name": [GLEANERIO_DOCKER_GLEANER_CONFIG]}) get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}") - nabuconfig = client.configs.list(filters={"name":[GLEANERIO_NABU_DOCKER_CONFIG]}) + nabuconfig = client.configs.list(filters={"name":[GLEANERIO_DOCKER_NABU_CONFIG]}) get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}") get_dagster_logger().info(f"create docker service for {name}") - gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_GLEANER_DOCKER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) - nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_NABU_DOCKER_CONFIG,GLEANERIO_NABU_CONFIG_PATH) + gleaner = ConfigReference(gleanerconfig[0].id, GLEANERIO_DOCKER_GLEANER_CONFIG,GLEANERIO_GLEANER_CONFIG_PATH) + nabu = ConfigReference(nabuconfig[0].id, GLEANERIO_DOCKER_NABU_CONFIG,GLEANERIO_NABU_CONFIG_PATH) configs = [gleaner,nabu] # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), service = client.services.create( @@ -307,7 +327,7 @@ def gleanerio(context, mode, source): ## ------------ Create returnCode = 0 get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - + date_string = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") if str(mode) == "gleaner": IMAGE =GLEANERIO_GLEANER_IMAGE @@ -424,13 +444,8 @@ def gleanerio(context, mode, source): data["Env"] = enva data["HostConfig"] = { "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data + # docker dagster get_dagster_logger().info(f"start docker code region: ") @@ -468,102 +483,74 @@ def gleanerio(context, mode, source): cid = container.id # legacy til the start get's fixed - - ## ------------ Archive to load, which is how to send 
in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" 
+ query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload +# Removed watching the logs, in favor of periodic upload + wait_count = 0 + while True: + wait_count += 1 + try: + container.wait(timeout=CONTAINER_WAIT_TIMEOUT) + exit_status = container.wait()["StatusCode"] + get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") + # WE PULL THE LOGS, then will throw an error + returnCode = exit_status + c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') + + # write to s3 + + s3_log_uploader(str(c).encode(), NAME, date_string=date_string) # s3loader needs a bytes like object + # s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object + # write to minio (would need the minio info here) + + get_dagster_logger().info(f"container Logs to s3: ") +# this needs to be address at some point. 
https://www.appsloveworld.com/docker/100/85/docker-py-getarchive-destination-folder + path = f"{WorkingDir}/logs" + tar_archive_stream, tar_stat = container.get_archive(path) + archive = bytearray() + for chunk in tar_archive_stream: + archive.extend(chunk) + s3_log_uploader(archive, f"{source}_{mode}_runlogs", date_string=date_string) + get_dagster_logger().info(f"uploaded logs : {source}_{mode}_runlogs to {path}") + break + except requests.exceptions.ReadTimeout as ex: + path = f"{WorkingDir}/logs" + tar_archive_stream, tar_stat = container.get_archive(path) + archive = bytearray() + for chunk in tar_archive_stream: + archive.extend(chunk) + s3_log_uploader(archive, f"{source}_{mode}_runlogs", date_string=date_string) + get_dagster_logger().info(f"uploaded {wait_count}th log : {source}_{mode}_runlogs to {path}") + except docker.errors.APIError as ex: + get_dagster_logger().info(f"Container Wait docker API error : {str(ex)}") + returnCode = 1 + break + if container.status == 'exited' or container.status == 'removed': + get_dagster_logger().info(f"Container exited or removed. status: {container.status}") + exit_status = container.wait()["StatusCode"] + returnCode = exit_status + s3_log_uploader(str(c).encode(), NAME) # s3loader needs a bytes like object + # s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object + # write to minio (would need the minio info here) + + get_dagster_logger().info(f"container Logs to s3: ") + # this needs to be address at some point. 
https://www.appsloveworld.com/docker/100/85/docker-py-getarchive-destination-folder + path = f"{WorkingDir}/logs" + tar_archive_stream, tar_stat = container.get_archive(path) + archive = bytearray() + for chunk in tar_archive_stream: + archive.extend(chunk) + s3_log_uploader(archive, f"{source}_{mode}_runlogs", date_string=date_string) + get_dagster_logger().info(f"uploaded logs : {source}_{mode}_runlogs to {path}") + break + + # ABOVE Future, need to extraxct files, and upload # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) # pw_tar.extractall("extract_to/") - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) if exit_status != 0: raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") finally: if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") if (service): service.remove() get_dagster_logger().info(f"Service Remove: {service.name}") @@ -572,14 +559,7 @@ def gleanerio(context, mode, source): else: get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - 
# - # else: - # get_dagster_logger().info(f"Container {container.name} NOT Removed : DEBUG ENABLED") + if (returnCode != 0): get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3") @@ -685,14 +665,15 @@ def SOURCEVAL_graph_reports(context) : graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) + + #returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) + s3FileUrl = _releaseUrl(source_name ) + returned_value = generateGraphReportsRelease(source_name,s3FileUrl) r = str('returned value:{}'.format(returned_value)) #report = json.dumps(returned_value, indent=2) # value already json.dumps report = returned_value s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - get_dagster_logger().info(f"graph report returned {r} ") + get_dagster_logger().info(f"graph stats returned {r} ") return @op(ins={"start": In(Nothing)}) @@ -718,8 +699,8 @@ def SOURCEVAL_bucket_urls(context): res = s3Minio.listSummonedUrls(bucket, source_name) r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) + bucketurls = pandas.DataFrame(res).to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC) + s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.csv", bucketurls) get_dagster_logger().info(f"bucker urls report returned {r} ") return @@ -738,7 +719,13 @@ def SOURCEVAL_summarize(context) : try: - summarydf = get_summary4repoSubset(endpoint, source_name) + # summarydf = get_summary4repoSubset(endpoint, source_name) + rg = ReleaseGraph() + rg.read_release(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), + bucket, + 
source_name, + options=MINIO_OPTIONS) + summarydf = rg.summarize() nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator summaryttl = g.serialize(format='longturtle') # Lets always write out file to s3, and insert as a separate process @@ -791,10 +778,11 @@ def harvest_SOURCEVAL(): # defingin nothing dependencies # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - report_ms3 = SOURCEVAL_missingreport_s3(start=harvest) + report_bucketurl = SOURCEVAL_bucket_urls(start=harvest) + report_ms3 = SOURCEVAL_missingreport_s3(start=report_bucketurl) report_idstat = SOURCEVAL_identifier_stats(start=report_ms3) # for some reason, this causes a msg parameter missing - report_bucketurl = SOURCEVAL_bucket_urls(start=report_idstat) + #report1 = missingreport_s3(harvest, source="SOURCEVAL") load_release = SOURCEVAL_naburelease(start=harvest) @@ -804,11 +792,14 @@ def harvest_SOURCEVAL(): load_prov = SOURCEVAL_nabuprov(start=load_prune) load_org = SOURCEVAL_nabuorg(start=load_prov) - summarize = SOURCEVAL_summarize(start=load_uploadrelease) - upload_summarize = SOURCEVAL_upload_summarize(start=summarize) + if(GLEANERIO_SUMMARIZE_GRAPH): + summarize = SOURCEVAL_summarize(start=load_uploadrelease) + upload_summarize = SOURCEVAL_upload_summarize(start=summarize) + + # run after load - report_msgraph = SOURCEVAL_missingreport_graph(start=summarize) + report_msgraph = SOURCEVAL_missingreport_graph(start=load_prov) report_graph = SOURCEVAL_graph_reports(start=report_msgraph) diff --git a/dagster/implnets/workflows/__init__.py b/dagster/implnets/workflows/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dagster/implnets/workflows/ecrr/NOTE_ECRR.md b/dagster/implnets/workflows/ecrr/NOTE_ECRR.md index 4676f75b..6fafd1ae 100644 --- a/dagster/implnets/workflows/ecrr/NOTE_ECRR.md +++ b/dagster/implnets/workflows/ecrr/NOTE_ECRR.md @@ -1,12 +1,24 @@ +TESTING UPLOADING WITH LOAD: +LOAD +LOAD + ECRR from google drive 
will require a manual harvest, and manual configuration. +ECRR_SUMBITTED IS THE REPO TO LOAD. + +ECRR_EXAMPLES is a sitemap from the GecodesMetadata repository + +It will not need to be summoned. It will be rsyc'd from the old goodge drive for now, and +later, it will need to read from the s3 bucket where the files are stored by the JSONFORMS app. + You need to generate the code, and modify the deployed config files in s3. -pygen.py -cf ./configs/ecrr/gleanerconfig.yaml -od ./repositories/ecrr -td ./templates/v1 -d 7 +pygen.py -cf ./configs/ecrr/gleanerconfig.yaml -od ./workflows/generated/ecrr -td ./templates/v1 -d 7 + +Then modify the output for the ops files and put into the ecrr folder. -Then modify the output for the ops files GLEANER_MINIO_BUCKET = os.environ.get('ECRR_MINIO_BUCKET') GLEANER_GRAPH_NAMESPACE = os.environ.get('ECRR_GRAPH_NAMESPACE') @@ -17,4 +29,13 @@ Remove the gleaner, missing reporting, identifer, bucket url steps... summarize steps. pass some string to the first nabu step +# RUNNING LOCALLY +* You need to point at a docker STACK, or portainer endpoint... A local workstation docker is usually not a STACK. 
+* set the ENV variables; ECRR_MINIO_BUCKET ECRR_GRAPH_NAMESPACE +* +`cd workflows/ecrr/ecrr +python -m dagster dev ` +To run a job: +`cd workflows/ecrr/ecrr +python -m dagster job execute -f jobs/implnet_jobs_ecrr_examples.py -j implnet_job_ecrr_examples` diff --git a/dagster/implnets/workflows/ecrr/ecrr/__init__.py b/dagster/implnets/workflows/ecrr/ecrr/__init__.py new file mode 100644 index 00000000..1929c4ef --- /dev/null +++ b/dagster/implnets/workflows/ecrr/ecrr/__init__.py @@ -0,0 +1,20 @@ +from dagster import repository, Definitions +import os +from .jobs.implnet_jobs_ecrr_submitted import job_ecrr_submitted +from .sch.implnet_sch_ecrr_submitted import implnet_sch_ecrr_submitted +from .jobs.implnet_jobs_ecrr_examples import job_ecrr_examples +from .sch.implnet_sch_ecrr_examples import implnet_sch_ecrr_examples + +from dagster_slack import SlackResource, make_slack_on_run_failure_sensor +slack_on_run_failure = make_slack_on_run_failure_sensor( + os.getenv("SLACK_CHANNEL"), + os.getenv("SLACK_TOKEN") +) +jobs = [ job_ecrr_submitted, job_ecrr_examples] +schedules = [ implnet_sch_ecrr_submitted, implnet_sch_ecrr_examples] + +defs = Definitions( + jobs=jobs, + schedules=schedules, + sensors=[slack_on_run_failure] +) diff --git a/dagster/implnets/workflows/ecrr/ecrr/jobs/implnet_jobs_ecrr_examples.py b/dagster/implnets/workflows/ecrr/ecrr/jobs/implnet_jobs_ecrr_examples.py new file mode 100644 index 00000000..6370385a --- /dev/null +++ b/dagster/implnets/workflows/ecrr/ecrr/jobs/implnet_jobs_ecrr_examples.py @@ -0,0 +1,7 @@ +from dagster import job + +from ..ops.implnet_ops_ecrr_examples import reload_ecrr_examples + +@job +def job_ecrr_examples(): + reload_ecrr_examples() diff --git a/dagster/implnets/workflows/ecrr/ecrr/jobs/implnet_jobs_ecrr_submitted.py b/dagster/implnets/workflows/ecrr/ecrr/jobs/implnet_jobs_ecrr_submitted.py new file mode 100644 index 00000000..8a099aee --- /dev/null +++ 
b/dagster/implnets/workflows/ecrr/ecrr/jobs/implnet_jobs_ecrr_submitted.py @@ -0,0 +1,7 @@ +from dagster import job + +from ..ops.implnet_ops_ecrr_submitted import reload_ecrr_submitted + +@job +def job_ecrr_submitted(): + reload_ecrr_submitted() diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ecrr_examples.py b/dagster/implnets/workflows/ecrr/ecrr/ops/implnet_ops_ecrr_examples.py similarity index 95% rename from dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ecrr_examples.py rename to dagster/implnets/workflows/ecrr/ecrr/ops/implnet_ops_ecrr_examples.py index c79c52c5..7ca4f370 100644 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_ecrr_examples.py +++ b/dagster/implnets/workflows/ecrr/ecrr/ops/implnet_ops_ecrr_examples.py @@ -1,4 +1,4 @@ -import distutils +from distutils import util import logging import time @@ -46,16 +46,16 @@ # Vars and Envs GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") # env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') +URL = os.environ.get('GLEANERIO_DOCKER_URL') +APIKEY = os.environ.get('GLEANERIO_PORTAINER_APIKEY') GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) +GLEANER_MINIO_USE_SSL = bool(util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) +GLEANER_MINIO_BUCKET =str( os.environ.get('ECRR_MINIO_BUCKET')) # set for the earhtcube utiltiies MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL @@ -67,7 +67,7 @@ GLEANER_HEADLESS_ENDPOINT = 
str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) # using GLEANER, even though this is a nabu property... same prefix seems easier GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) +GLEANER_GRAPH_NAMESPACE = str(os.environ.get('ECRR_GRAPH_NAMESPACE')) GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) @@ -77,8 +77,8 @@ GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) +GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_DOCKER_GLEANER_CONFIG', 'gleaner')) +GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_DOCKER_NABU_CONFIG', 'nabu')) #GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) @@ -784,32 +784,13 @@ def ecrr_examples_upload_summarize(context): # r = str('returned value:{}'.format(returned_value)) # return msg + r @graph -def harvest_ecrr_examples(): +def reload_ecrr_examples(): containers = ecrr_examples_getImage() harvest = ecrr_examples_gleaner(start=containers) - -# defingin nothing dependencies - # 
https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = ecrr_examples_missingreport_s3(start=harvest) - report_idstat = ecrr_examples_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = ecrr_examples_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="ecrr_examples") load_release = ecrr_examples_naburelease(start=harvest) load_uploadrelease = ecrr_examples_uploadrelease(start=load_release) + # report_graph = ecrr_examples_graph_reports(start=load_uploadrelease) - load_prune = ecrr_examples_nabu_prune(start=load_uploadrelease) - load_prov = ecrr_examples_nabuprov(start=load_prune) - load_org = ecrr_examples_nabuorg(start=load_prov) - - summarize = ecrr_examples_summarize(start=load_uploadrelease) - upload_summarize = ecrr_examples_upload_summarize(start=summarize) - -# run after load - report_msgraph = ecrr_examples_missingreport_graph(start=summarize) - report_graph = ecrr_examples_graph_reports(start=report_msgraph) diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_edi.py b/dagster/implnets/workflows/ecrr/ecrr/ops/implnet_ops_ecrr_submitted.py similarity index 90% rename from dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_edi.py rename to dagster/implnets/workflows/ecrr/ecrr/ops/implnet_ops_ecrr_submitted.py index 4f25ac57..a36c6ee9 100644 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_edi.py +++ b/dagster/implnets/workflows/ecrr/ecrr/ops/implnet_ops_ecrr_submitted.py @@ -1,4 +1,4 @@ -import distutils +from distutils import util import logging import time @@ -36,26 +36,26 @@ from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image from docker.types.services import ContainerSpec, TaskTemplate, ConfigReference -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') +DEBUG=(os.getenv('DEBUG_CONTAINER', 'False').lower() == 'true') # # # path 
to gleaner config in Dagster-daemon is "/scheduler/gleanerconfig.yaml" (config file mounted) # WHEN RUNNING dagster-dev, this needs to be a path to a local file ## -DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") +DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('GLEANERIO_DAGSTER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml") # Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") +GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_DOCKER_HEADLESS_NETWORK', "headless_gleanerio") # env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') +URL = os.environ.get('GLEANERIO_DOCKER_URL') +APIKEY = os.environ.get('GLEANERIO_PORTAINER_APIKEY') GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) -GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) +GLEANER_MINIO_USE_SSL = bool(util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) -GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) +GLEANER_MINIO_BUCKET =str( os.environ.get('ECRR_MINIO_BUCKET')) # set for the earhtcube utiltiies MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL @@ -67,7 +67,7 @@ GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) # using GLEANER, even though this is a nabu property... 
same prefix seems easier GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) -GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) +GLEANER_GRAPH_NAMESPACE = str(os.environ.get('ECRR_GRAPH_NAMESPACE')) GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) @@ -77,10 +77,10 @@ GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) -GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) -GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) +GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_DOCKER_GLEANER_CONFIG', 'gleaner')) +GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_DOCKER_NABU_CONFIG', 'nabu')) #GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') -GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) +GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_GRAPH_SUMMARY_ENDPOINT',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) SUMMARY_PATH = 'graphs/summary' RELEASE_PATH = 'graphs/latest' @@ -206,7 +206,7 @@ def post_to_graph(source, path=RELEASE_PATH, extension="nq", graphendpoint=_grap get_dagster_logger().info(f'graph: insert "{source}" to {url} ') loadfrom = {'update': f'LOAD <{release_url}>'} headers = { - 'Content-Type': 'application/x-www-form-urlencoded' + 'Content-Type': 'application/x-www-form-urlencoded; 
charset=UTF-8' } r = requests.post(url, headers=headers, data=loadfrom ) log.debug(f' status:{r.status_code}') # status:404 @@ -587,7 +587,7 @@ def gleanerio(context, mode, source): return returnCode @op -def edi_getImage(context): +def ecrr_submitted_getImage(context): run_container_context = DockerContainerContext.create_for_run( context.dagster_run, context.instance.run_launcher @@ -599,54 +599,54 @@ def edi_getImage(context): client.images.pull(GLEANERIO_GLEANER_IMAGE) client.images.pull(GLEANERIO_NABU_IMAGE) @op(ins={"start": In(Nothing)}) -def edi_gleaner(context): - returned_value = gleanerio(context, ("gleaner"), "edi") +def ecrr_submitted_gleaner(context): + returned_value = gleanerio(context, ("gleaner"), "ecrr_submitted") r = str('returned value:{}'.format(returned_value)) get_dagster_logger().info(f"Gleaner returned {r} ") return @op(ins={"start": In(Nothing)}) -def edi_nabu_prune(context): - returned_value = gleanerio(context,("prune"), "edi") +def ecrr_submitted_nabu_prune(context): + returned_value = gleanerio(context,("prune"), "ecrr_submitted") r = str('returned value:{}'.format(returned_value)) get_dagster_logger().info(f"nabu prune returned {r} ") return @op(ins={"start": In(Nothing)}) -def edi_nabuprov(context): - returned_value = gleanerio(context,("prov"), "edi") +def ecrr_submitted_nabuprov(context): + returned_value = gleanerio(context,("prov"), "ecrr_submitted") r = str('returned value:{}'.format(returned_value)) get_dagster_logger().info(f"nabu prov returned {r} ") return @op(ins={"start": In(Nothing)}) -def edi_nabuorg(context): - returned_value = gleanerio(context,("orgs"), "edi") +def ecrr_submitted_nabuorg(context): + returned_value = gleanerio(context,("orgs"), "ecrr_submitted") r = str('returned value:{}'.format(returned_value)) get_dagster_logger().info(f"nabu org load returned {r} ") return @op(ins={"start": In(Nothing)}) -def edi_naburelease(context): - returned_value = gleanerio(context,("release"), "edi") +def 
ecrr_submitted_naburelease(context): + returned_value = gleanerio(context,("release"), "ecrr_submitted") r = str('returned value:{}'.format(returned_value)) get_dagster_logger().info(f"nabu release returned {r} ") return @op(ins={"start": In(Nothing)}) -def edi_uploadrelease(context): - returned_value = post_to_graph("edi", extension="nq") +def ecrr_submitted_uploadrelease(context): + returned_value = post_to_graph("ecrr_submitted", extension="nq") r = str('returned value:{}'.format(returned_value)) get_dagster_logger().info(f"upload release returned {r} ") return @op(ins={"start": In(Nothing)}) -def edi_missingreport_s3(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="edi") +def ecrr_submitted_missingreport_s3(context): + source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ecrr_submitted") source_url = source.get('url') s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) bucket = GLEANER_MINIO_BUCKET - source_name = "edi" + source_name = "ecrr_submitted" graphendpoint = None milled = False summon = True @@ -657,12 +657,12 @@ def edi_missingreport_s3(context): get_dagster_logger().info(f"missing s3 report returned {r} ") return @op(ins={"start": In(Nothing)}) -def edi_missingreport_graph(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="edi") +def ecrr_submitted_missingreport_graph(context): + source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ecrr_submitted") source_url = source.get('url') s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) bucket = GLEANER_MINIO_BUCKET - source_name = "edi" + source_name = "ecrr_submitted" graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" @@ -676,12 +676,12 @@ def edi_missingreport_graph(context): 
get_dagster_logger().info(f"missing graph report returned {r} ") return @op(ins={"start": In(Nothing)}) -def edi_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="edi") +def ecrr_submitted_graph_reports(context) : + source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ecrr_submitted") #source_url = source.get('url') s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) bucket = GLEANER_MINIO_BUCKET - source_name = "edi" + source_name = "ecrr_submitted" graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" @@ -696,11 +696,11 @@ def edi_graph_reports(context) : return @op(ins={"start": In(Nothing)}) -def edi_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="edi") +def ecrr_submitted_identifier_stats(context): + source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="ecrr_submitted") s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) bucket = GLEANER_MINIO_BUCKET - source_name = "edi" + source_name = "ecrr_submitted" returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) r = str('returned value:{}'.format(returned_value)) @@ -711,10 +711,10 @@ def edi_identifier_stats(context): return @op(ins={"start": In(Nothing)}) -def edi_bucket_urls(context): +def ecrr_submitted_bucket_urls(context): s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) bucket = GLEANER_MINIO_BUCKET - source_name = "edi" + source_name = "ecrr_submitted" res = s3Minio.listSummonedUrls(bucket, source_name) r = str('returned value:{}'.format(res)) @@ -728,10 +728,10 @@ class S3ObjectInfo: object_name="" @op(ins={"start": In(Nothing)}) -def edi_summarize(context) : +def 
ecrr_submitted_summarize(context) : s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) bucket = GLEANER_MINIO_BUCKET - source_name = "edi" + source_name = "ecrr_submitted" endpoint = _graphEndpoint() # getting data, not uploading data summary_namespace = _graphSummaryEndpoint() @@ -762,20 +762,20 @@ def edi_summarize(context) : return @op(ins={"start": In(Nothing)}) -def edi_upload_summarize(context): - returned_value = post_to_graph("edi",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) +def ecrr_submitted_upload_summarize(context): + returned_value = post_to_graph("ecrr_submitted",path=SUMMARY_PATH, extension="ttl", graphendpoint=_graphSummaryEndpoint()) r = str('returned value:{}'.format(returned_value)) get_dagster_logger().info(f"upload summary returned {r} ") return #Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="edi"): +# def missingreport_s3(context, msg: str, source="ecrr_submitted"): # # source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) # source_url = source.get('url') # s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) # bucket = GLEANER_MINIO_BUCKET -# source_name="edi" +# source_name="ecrr_submitted" # # graphendpoint = None # milled = False @@ -784,32 +784,14 @@ def edi_upload_summarize(context): # r = str('returned value:{}'.format(returned_value)) # return msg + r @graph -def harvest_edi(): - containers = edi_getImage() - harvest = edi_gleaner(start=containers) +def reload_ecrr_submitted(): + containers = ecrr_submitted_getImage() + load_release = ecrr_submitted_naburelease(start=containers) + load_uploadrelease = ecrr_submitted_uploadrelease(start=load_release) + #report_graph = ecrr_submitted_graph_reports(start=load_uploadrelease) + # harvest = ecrr_submitted_gleaner(start=containers) -# defingin nothing 
dependencies - # https://docs.dagster.io/concepts/ops-jobs-graphs/graphs#defining-nothing-dependencies - - report_ms3 = edi_missingreport_s3(start=harvest) - report_idstat = edi_identifier_stats(start=report_ms3) - # for some reason, this causes a msg parameter missing - report_bucketurl = edi_bucket_urls(start=report_idstat) - - #report1 = missingreport_s3(harvest, source="edi") - load_release = edi_naburelease(start=harvest) - load_uploadrelease = edi_uploadrelease(start=load_release) - - load_prune = edi_nabu_prune(start=load_uploadrelease) - load_prov = edi_nabuprov(start=load_prune) - load_org = edi_nabuorg(start=load_prov) - - summarize = edi_summarize(start=load_uploadrelease) - upload_summarize = edi_upload_summarize(start=summarize) - -# run after load - report_msgraph = edi_missingreport_graph(start=summarize) - report_graph = edi_graph_reports(start=report_msgraph) +# diff --git a/dagster/implnets/workflows/ecrr/ecrr/sch/implnet_sch_ecrr_examples.py b/dagster/implnets/workflows/ecrr/ecrr/sch/implnet_sch_ecrr_examples.py new file mode 100644 index 00000000..8f56fc9e --- /dev/null +++ b/dagster/implnets/workflows/ecrr/ecrr/sch/implnet_sch_ecrr_examples.py @@ -0,0 +1,8 @@ +from dagster import schedule + +from ..jobs.implnet_jobs_ecrr_examples import job_ecrr_examples + +@schedule(cron_schedule="0 16 5 * *", job=job_ecrr_examples, execution_timezone="US/Central") +def implnet_sch_ecrr_examples(_context): + run_config = {} + return run_config diff --git a/dagster/implnets/workflows/ecrr/ecrr/sch/implnet_sch_ecrr_submitted.py b/dagster/implnets/workflows/ecrr/ecrr/sch/implnet_sch_ecrr_submitted.py new file mode 100644 index 00000000..5b2ed643 --- /dev/null +++ b/dagster/implnets/workflows/ecrr/ecrr/sch/implnet_sch_ecrr_submitted.py @@ -0,0 +1,8 @@ +from dagster import schedule + +from ..jobs.implnet_jobs_ecrr_submitted import job_ecrr_submitted + +@schedule(cron_schedule="0 8 3 * *", job=job_ecrr_submitted, execution_timezone="US/Central") +def 
implnet_sch_ecrr_submitted(_context): + run_config = {} + return run_config diff --git a/dagster/implnets/generatedCode/implnet-eco/output/workspace.yaml b/dagster/implnets/workflows/ecrr/ecrr/workspace.yaml similarity index 100% rename from dagster/implnets/generatedCode/implnet-eco/output/workspace.yaml rename to dagster/implnets/workflows/ecrr/ecrr/workspace.yaml diff --git a/dagster/implnets/workflows/ecrr/jobs/implnet_jobs_ecrr_examples.py b/dagster/implnets/workflows/ecrr/jobs/implnet_jobs_ecrr_examples.py deleted file mode 100644 index cb51bb8d..00000000 --- a/dagster/implnets/workflows/ecrr/jobs/implnet_jobs_ecrr_examples.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ecrr_examples import harvest_ecrr_examples - -@job -def implnet_job_ecrr_examples(): - harvest_ecrr_examples() \ No newline at end of file diff --git a/dagster/implnets/workflows/ecrr/jobs/implnet_jobs_ecrr_submitted.py b/dagster/implnets/workflows/ecrr/jobs/implnet_jobs_ecrr_submitted.py deleted file mode 100644 index 715b4592..00000000 --- a/dagster/implnets/workflows/ecrr/jobs/implnet_jobs_ecrr_submitted.py +++ /dev/null @@ -1,7 +0,0 @@ -from dagster import job - -from ops.implnet_ops_ecrr_submitted import harvest_ecrr_submitted - -@job -def implnet_job_ecrr_submitted(): - harvest_ecrr_submitted() \ No newline at end of file diff --git a/dagster/implnets/workflows/ecrr/ops/implnet_ops_ecrr_examples.py b/dagster/implnets/workflows/ecrr/ops/implnet_ops_ecrr_examples.py deleted file mode 100644 index 89a9f681..00000000 --- a/dagster/implnets/workflows/ecrr/ops/implnet_ops_ecrr_examples.py +++ /dev/null @@ -1,673 +0,0 @@ -import distutils - -from dagster import job, op, graph, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError -from dagster import job, op, get_dagster_logger -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from 
minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANER_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANER_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = os.environ.get('GLEANER_MINIO_ADDRESS') -GLEANER_MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -GLEANER_MINIO_USE_SSL = os.environ.get('GLEANER_MINIO_USE_SSL') -GLEANER_MINIO_SECRET_KEY = os.environ.get('GLEANER_MINIO_SECRET_KEY') -GLEANER_MINIO_ACCESS_KEY = os.environ.get('GLEANER_MINIO_ACCESS_KEY') -GLEANER_MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') -GLEANER_HEADLESS_ENDPOINT = os.environ.get('GLEANER_HEADLESS_ENDPOINT', "http://headless:9222") -# using GLEANER, even though this is a nabu property... 
same prefix seems easier -GLEANER_GRAPH_URL = os.environ.get('GLEANER_GRAPH_URL') -GLEANER_GRAPH_NAMESPACE = os.environ.get('GLEANER_GRAPH_NAMESPACE') -GLEANERIO_GLEANER_CONFIG_PATH= os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml") -GLEANERIO_NABU_CONFIG_PATH= os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml") - -def _graphEndpoint(): - url = f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS')) + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_ADDRESS'))}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(GLEANER_MINIO_BUCKET)}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_USE_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_ACCESS_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET_KEY'), - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except 
S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_USE_SSL'))) - if (os.environ.get('GLEANER_MINIO_PORT') and os.environ.get('GLEANER_MINIO_PORT') == 80 - and secure == False): - server = _pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS')) - elif (os.environ.get('GLEANER_MINIO_PORT') and os.environ.get('GLEANER_MINIO_PORT') == 443 - and secure == True): - server = _pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS')) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS'))}:{os.environ.get('GLEANER_MINIO_PORT')}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_ACCESS_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET_KEY'), - ) - - # Make 'X' bucket if not exist. - # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if os.environ.get('GLEANER_MINIO_USE_SSL'): - proto = "https" - port = os.environ.get('GLEANER_MINIO_PORT') - address = os.environ.get('GLEANER_MINIO_ADDRESS') - bucket = 
GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - #client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"creat docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _get_container_name(run_id, op_name, retry_number): - container_name = hash_str(run_id + op_name) - - retry_number = retry_number - if retry_number > 0: - container_name = f"{container_name}-{retry_number}" - - return container_name - - -def _create_container( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: 
Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="" -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"creat docker container") - return client.containers.create( - image, - name=name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - detach=True, - network=container_context.networks[0] if len(container_context.networks) else None, - # entrypoint=entrypoint, - command=command, - environment=env_vars, - **container_context.container_kwargs, - ) - -def gleanerio(context, mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - # CMD = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude" - CMD = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"gleaner01_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"nabu01_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD 
= ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"nabu01_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"nabu01_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = f"nabu01_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = CMD -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # 
v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - 
enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - "volumes": { - f"{GLEANER_CONFIG_VOLUME}": - {'bind': '/configs', 'mode': 'rw'} - }, - - - }, - ) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_container: ") - container = _create_container( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME - ) - except docker.errors.ImageNotFound: - client.images.pull(IMAGE) - container = _create_container( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME - ) - - if len(container_context.networks) > 1: - for network_name in container_context.networks[1:]: - network = client.networks.get(network_name) - network.connect(container) - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) 
- - - DATA = s3reader(ARCHIVE_FILE) - container.put_archive(ARCHIVE_PATH,DATA ) - - - ## ------------ Start - ## note new issue: - # {"message": "starting container with non-empty request body was deprecated since API v1.22 and removed in v1.24"} - EMPTY_DATA="{}".encode('utf-8') - url = URL + 'containers/' + cid + '/start' - get_dagster_logger().info(f"Container start url: {url}") - req = request.Request(url,data=EMPTY_DATA, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - try: - r = request.urlopen(req) - except HTTPError as err: - get_dagster_logger().fatal(f"Container Start failed: {str(err.code)} reason: {err.reason}") - raise err - except Exception as err: - print("failed to start container: unknown reason: ", err) - get_dagster_logger().info(f"Create Failed: unknown reason {str(err)}") - raise err - print(r.status) - get_dagster_logger().info(f"Start container: {str(r.status)}") - - # container.start() - # client.api.start(container=container.id) - ## start is not working - - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - - # ## ------------ Wait expect 200 - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=True).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: {str(r.status)}") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - 
query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{str(mode)}_runlogs") - - # - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - if (cid): - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Container Remove: {str(r.status)}") - else: - get_dagster_logger().info(f"Container Not created, so not removed.") - else: - get_dagster_logger().info(f"Container NOT Remove: DEBUG ENABLED") - - - return 0 - -@op -def ecrr_examples_gleaner(context)-> str: - returned_value = gleanerio(context, ("gleaner"), "ecrr_examples") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} 
") - return r - -@op -def ecrr_examples_nabu_prune(context, msg: str)-> str: - returned_value = gleanerio(context,("nabu"), "ecrr_examples") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ecrr_examples_nabuprov(context, msg: str)-> str: - returned_value = gleanerio(context,("prov"), "ecrr_examples") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ecrr_examples_nabuorg(context, msg: str)-> str: - returned_value = gleanerio(context,("orgs"), "ecrr_examples") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ecrr_examples_naburelease(context, msg: str) -> str: - returned_value = gleanerio(context,("release"), "ecrr_examples") - r = str('returned value:{}'.format(returned_value)) - return msg + r -@op -def ecrr_examples_uploadrelease(context, msg: str) -> str: - returned_value = postRelease("ecrr_examples") - r = str('returned value:{}'.format(returned_value)) - return msg + r - - -@op -def ecrr_examples_missingreport_s3(context, msg: str) -> str: - source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_examples") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ecrr_examples" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - return msg + r -@op -def ecrr_examples_missingreport_graph(context, msg: str) -> str: - source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_examples") - source_url = source.get('url') - s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ecrr_examples" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - - return msg + r -@op -def ecrr_examples_graph_reports(context, msg: str) -> str: - source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_examples") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ecrr_examples" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - - return msg + r - -@op -def ecrr_examples_identifier_stats(context, msg: str) -> str: - source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_examples") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ecrr_examples" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = 
str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - return msg + r - -@op() -def ecrr_examples_bucket_urls(context, msg: str) -> str: - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ecrr_examples" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - return msg + r - - -#Can we simplify and use just a method. Then import these methods? -# def missingreport_s3(context, msg: str, source="ecrr_examples"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="ecrr_examples" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_ecrr_examples(): - pass -# # harvest = ecrr_examples_gleaner() -# -# # report_ms3 = ecrr_examples_missingreport_s3(harvest) -# report_idstat = ecrr_examples_identifier_stats(report_ms3) -# # for some reason, this causes a msg parameter missing -# report_bucketurl = ecrr_examples_bucket_urls(report_idstat) -# -# #report1 = missingreport_s3(harvest, source="ecrr_examples") -# load_release = ecrr_examples_naburelease(harvest) -# load_uploadrelease = ecrr_examples_uploadrelease(load_release) -# -# load_prune = ecrr_examples_nabu_prune(load_uploadrelease) -# load_prov = ecrr_examples_nabuprov(load_prune) -# load_org = ecrr_examples_nabuorg(load_prov) -# 
-# # run after load -# report_msgraph=ecrr_examples_missingreport_graph(load_org) -# report_graph=ecrr_examples_graph_reports(report_msgraph) - - - - diff --git a/dagster/implnets/workflows/ecrr/ops/implnet_ops_ecrr_submitted.py b/dagster/implnets/workflows/ecrr/ops/implnet_ops_ecrr_submitted.py deleted file mode 100644 index b286817b..00000000 --- a/dagster/implnets/workflows/ecrr/ops/implnet_ops_ecrr_submitted.py +++ /dev/null @@ -1,672 +0,0 @@ -import distutils - -from dagster import job, op, graph, get_dagster_logger -import os, json, io -import urllib -from urllib import request -from urllib.error import HTTPError -from dagster import job, op, get_dagster_logger -from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner -import json - -from minio import Minio -from minio.error import S3Error -from datetime import datetime -from ec.reporting.report import missingReport, generateGraphReportsRepo, reportTypes, generateIdentifierRepo -from ec.datastore import s3 -import requests -import logging as log -from urllib.error import HTTPError - -from typing import Any, Mapping, Optional, Sequence - -import docker -from dagster import Field, In, Nothing, OpExecutionContext, StringSource, op -from dagster._annotations import experimental -from dagster._core.utils import parse_env_var -from dagster._serdes.utils import hash_str - -from dagster_docker.container_context import DockerContainerContext -from dagster_docker.docker_run_launcher import DockerRunLauncher -from dagster_docker.utils import DOCKER_CONFIG_SCHEMA, validate_docker_image - -DEBUG=(os.getenv('DEBUG', 'False').lower() == 'true') -# volume and netowrk need to be the names in docker, and not the names of the object in docker compose -GLEANER_CONFIG_VOLUME=os.environ.get('GLEANER_CONFIG_VOLUME', "dagster_gleaner_configs") -# Vars and Envs -GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANER_HEADLESS_NETWORK', "headless_gleanerio") -# env items -URL = os.environ.get('PORTAINER_URL') -APIKEY = 
os.environ.get('PORTAINER_KEY') - - -GLEANER_MINIO_ADDRESS = os.environ.get('GLEANER_MINIO_ADDRESS') -GLEANER_MINIO_PORT = os.environ.get('GLEANER_MINIO_PORT') -GLEANER_MINIO_USE_SSL = os.environ.get('GLEANER_MINIO_USE_SSL') -GLEANER_MINIO_SECRET_KEY = os.environ.get('GLEANER_MINIO_SECRET_KEY') -GLEANER_MINIO_ACCESS_KEY = os.environ.get('GLEANER_MINIO_ACCESS_KEY') -GLEANER_MINIO_BUCKET = os.environ.get('GLEANER_MINIO_BUCKET') -GLEANER_HEADLESS_ENDPOINT = os.environ.get('GLEANER_HEADLESS_ENDPOINT', "http://headless:9222") -# using GLEANER, even though this is a nabu property... same prefix seems easier -GLEANER_GRAPH_URL = os.environ.get('GLEANER_GRAPH_URL') -GLEANER_GRAPH_NAMESPACE = os.environ.get('GLEANER_GRAPH_NAMESPACE') -GLEANERIO_GLEANER_CONFIG_PATH= os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml") -GLEANERIO_NABU_CONFIG_PATH= os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml") - -def _graphEndpoint(): - url = f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - return url - -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL -def read_file_bytestream(image_path): - data = open(image_path, 'rb').read() - return data - - -def load_data(file_or_url): - try: - with urllib.request.urlopen(file_or_url) as f: - data = f.read() - except ValueError: - with open(file_or_url, 'rb') as f: - data = f.read() - return data - - -def s3reader(object): - server = _pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS')) + ":" + os.environ.get('GLEANER_MINIO_PORT') - get_dagster_logger().info(f"S3 URL : {str(os.environ.get('GLEANER_MINIO_ADDRESS'))}") - get_dagster_logger().info(f"S3 PYTHON SERVER : {server}") - get_dagster_logger().info(f"S3 PORT : {str(os.environ.get('GLEANER_MINIO_PORT'))}") - # get_dagster_logger().info(f"S3 read started : 
{str(os.environ.get('GLEANER_MINIO_KEY'))}") - # get_dagster_logger().info(f"S3 read started : {str(os.environ.get('GLEANER_MINIO_SECRET'))}") - get_dagster_logger().info(f"S3 BUCKET : {str(GLEANER_MINIO_BUCKET)}") - get_dagster_logger().info(f"S3 object : {str(object)}") - - client = Minio( - server, - # secure=True, - secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_USE_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_ACCESS_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET_KEY'), - ) - try: - data = client.get_object(GLEANER_MINIO_BUCKET, object) - return data - except S3Error as err: - get_dagster_logger().info(f"S3 read error : {str(err)}") - - -def s3loader(data, name): - secure= bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_USE_SSL'))) - if (os.environ.get('GLEANER_MINIO_PORT') and os.environ.get('GLEANER_MINIO_PORT') == 80 - and secure == False): - server = _pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS')) - elif (os.environ.get('GLEANER_MINIO_PORT') and os.environ.get('GLEANER_MINIO_PORT') == 443 - and secure == True): - server = _pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS')) - else: - # it's not on a normal port - server = f"{_pythonMinioUrl(os.environ.get('GLEANER_MINIO_ADDRESS'))}:{os.environ.get('GLEANER_MINIO_PORT')}" - - client = Minio( - server, - secure=secure, - #secure = bool(distutils.util.strtobool(os.environ.get('GLEANER_MINIO_SSL'))), - access_key=os.environ.get('GLEANER_MINIO_ACCESS_KEY'), - secret_key=os.environ.get('GLEANER_MINIO_SECRET_KEY'), - ) - - # Make 'X' bucket if not exist. 
- # found = client.bucket_exists("X") - # if not found: - # client.make_bucket("X") - # else: - # print("Bucket 'X' already exists") - - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") - - logname = name + '_{}.log'.format(date_string) - objPrefix = os.environ.get('GLEANERIO_LOG_PREFIX') + logname - f = io.BytesIO() - #length = f.write(bytes(json_str, 'utf-8')) - length = f.write(data) - f.seek(0) - client.put_object(GLEANER_MINIO_BUCKET, - objPrefix, - f, #io.BytesIO(data), - length, #len(data), - content_type="text/plain" - ) - get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}") -def postRelease(source): - # revision of EC utilities, will have a insertFromURL - #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') ) - proto = "http" - - if os.environ.get('GLEANER_MINIO_USE_SSL'): - proto = "https" - port = os.environ.get('GLEANER_MINIO_PORT') - address = os.environ.get('GLEANER_MINIO_ADDRESS') - bucket = GLEANER_MINIO_BUCKET - path = "graphs/latest" - release_url = f"{proto}://{address}:{port}/{bucket}/{path}/{source}_release.nq" - url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}" - get_dagster_logger().info(f'graph: insert "{source}" to {url} ') - r = requests.post(url) - log.debug(f' status:{r.status_code}') # status:404 - get_dagster_logger().info(f'graph: insert: status:{r.status_code}') - if r.status_code == 200: - # '' - if 'data modified="0"' in r.text: - get_dagster_logger().info(f'graph: no data inserted ') - raise Exception("No Data Added: " + r.text) - return True - else: - get_dagster_logger().info(f'graph: error') - raise Exception(f' graph: insert failed: status:{r.status_code}') - -def _get_client(docker_container_context: DockerContainerContext): - headers = {'X-API-Key': APIKEY} - client = docker.DockerClient(base_url=URL, version="1.43" ) - 
#client = docker.APIClient(base_url=URL, version="1.35") - get_dagster_logger().info(f"creat docker client") - if (client.api._general_configs): - client.api._general_configs["HttpHeaders"] = headers - else: - client.api._general_configs={"HttpHeaders":headers} - client.api.headers['X-API-Key'] = APIKEY - get_dagster_logger().info(f" docker version {client.version()}") - if docker_container_context.registry: - client.login( - registry=docker_container_context.registry["url"], - username=docker_container_context.registry["username"], - password=docker_container_context.registry["password"], - ) - return client - - -def _get_container_name(run_id, op_name, retry_number): - container_name = hash_str(run_id + op_name) - - retry_number = retry_number - if retry_number > 0: - container_name = f"{container_name}-{retry_number}" - - return container_name - - -def _create_container( - op_context: OpExecutionContext, - client, - container_context: DockerContainerContext, - image: str, - entrypoint: Optional[Sequence[str]], - command: Optional[Sequence[str]], - name="" -): - env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars]) - get_dagster_logger().info(f"creat docker container") - return client.containers.create( - image, - name=name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number), - detach=True, - network=container_context.networks[0] if len(container_context.networks) else None, - # entrypoint=entrypoint, - command=command, - environment=env_vars, - **container_context.container_kwargs, - ) - -def gleanerio(context, mode, source): - ## ------------ Create - - get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - - if str(mode) == "gleaner": - IMAGE = os.environ.get('GLEANERIO_GLEANER_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH') - # CMD = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source 
{source} --rude" - CMD = ["--cfg", GLEANERIO_GLEANER_CONFIG_PATH,"-source", source, "--rude"] - NAME = f"gleaner01_{source}_{str(mode)}" - WorkingDir = "/gleaner/" - #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"] - # LOGFILE = 'log_gleaner.txt' # only used for local log file writing - elif (str(mode) == "nabu"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prune", "--prefix", "summoned/" + source] - NAME = f"nabu01_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "prov"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "prov/" + source] - NAME = f"nabu01_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "orgs"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "prefix", "--prefix", "orgs"] - NAME = f"nabu01_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - elif (str(mode) == "release"): - IMAGE = os.environ.get('GLEANERIO_NABU_IMAGE') - ARCHIVE_FILE = os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT') - ARCHIVE_PATH = os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH') - CMD = ["--cfg", GLEANERIO_NABU_CONFIG_PATH, "release", "--prefix", "summoned/" + source] - NAME = 
f"nabu01_{source}_{str(mode)}" - WorkingDir = "/nabu/" - Entrypoint = "nabu" - # LOGFILE = 'log_nabu.txt' # only used for local log file writing - else: - return 1 - - # from docker0dagster - run_container_context = DockerContainerContext.create_for_run( - context.dagster_run, - context.instance.run_launcher - if isinstance(context.instance.run_launcher, DockerRunLauncher) - else None, - ) - validate_docker_image(IMAGE) - - try: - # setup data/body for container create - data = {} - data["Image"] = IMAGE - data["WorkingDir"] = WorkingDir - #data["Entrypoint"] = Entrypoint - data["Cmd"] = CMD -#### gleaner - # v.BindEnv("minio.address", "MINIO_ADDRESS") - # v.BindEnv("minio.port", "MINIO_PORT") - # v.BindEnv("minio.ssl", "MINIO_USE_SSL") - # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY") - # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY") - # v.BindEnv("minio.bucket", "MINIO_BUCKET") - # // v.BindEnv("minio.region", "MINIO_REGION") - # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT") - # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE") - # v.BindEnv("sparql.username", "SPARQL_USERNAME") - # v.BindEnv("sparql.password", "SPARQL_PASSWORD") - # v.BindEnv("s3.domain", "S3_DOMAIN") -### gleaner summoner config - # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT") - # viperSubtree.BindEnv("threads", "GLEANER_THREADS") - # viperSubtree.BindEnv("mode", "GLEANER_MODE") - - #### NABU config - # minioSubtress.BindEnv("address", "MINIO_ADDRESS") - # minioSubtress.BindEnv("port", "MINIO_PORT") - # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL") - # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY") - # minioSubtress.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT") - ###### nabu sparql config - # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK") - # viperSubtree.BindEnv("endpointMethod", 
"SPARQL_ENDPOINTMETHOD") - # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE") - # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE") - # viperSubtree.BindEnv("username", "SPARQL_USERNAME") - # viperSubtree.BindEnv("password", "SPARQL_PASSWORD") - ### NABU object - # viperSubtree.BindEnv("bucket", "MINIO_BUCKET") - # viperSubtree.BindEnv("domain", "S3_DOMAIN") - # add in env variables here"Env": ["FOO=bar","BAZ=quux"], - - # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE - enva = [] - enva.append(str("MINIO_ADDRESS={}".format(GLEANER_MINIO_ADDRESS))) - enva.append(str("MINIO_PORT={}".format(GLEANER_MINIO_PORT))) - enva.append(str("MINIO_USE_SSL={}".format(GLEANER_MINIO_USE_SSL))) - enva.append(str("MINIO_SECRET_KEY={}".format(GLEANER_MINIO_SECRET_KEY))) - enva.append(str("MINIO_ACCESS_KEY={}".format(GLEANER_MINIO_ACCESS_KEY))) - enva.append(str("MINIO_BUCKET={}".format(GLEANER_MINIO_BUCKET))) - enva.append(str("SPARQL_ENDPOINT={}".format(_graphEndpoint()))) - enva.append(str("GLEANER_HEADLESS_ENDPOINT={}".format(GLEANER_HEADLESS_ENDPOINT))) - enva.append(str("GLEANER_HEADLESS_NETWORK={}".format(GLEANER_HEADLESS_NETWORK))) - - data["Env"] = enva - data["HostConfig"] = { - "NetworkMode": GLEANER_HEADLESS_NETWORK, - "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] - } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data - -# docker dagster - get_dagster_logger().info(f"start docker code region: ") - # trying to get headers in: - # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45 - op_container_context = DockerContainerContext( - # registry=registry, - env_vars=enva, - networks=[GLEANER_HEADLESS_NETWORK], - container_kwargs={"working_dir": data["WorkingDir"], - "volumes": { - f"{GLEANER_CONFIG_VOLUME}": - {'bind': '/configs', 'mode': 'rw'} - }, - - - }, - 
) - container_context = run_container_context.merge(op_container_context) - get_dagster_logger().info(f"call docker _get_client: ") - client = _get_client(container_context) - - try: - get_dagster_logger().info(f"try docker _create_container: ") - container = _create_container( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME - ) - except docker.errors.ImageNotFound: - client.images.pull(IMAGE) - container = _create_container( - context, client, container_context, IMAGE, "", data["Cmd"], name=NAME - ) - - if len(container_context.networks) > 1: - for network_name in container_context.networks[1:]: - network = client.networks.get(network_name) - network.connect(container) - - cid = container.id # legacy til the start get's fixed - - - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - DATA = s3reader(ARCHIVE_FILE) - container.put_archive(ARCHIVE_PATH,DATA ) - - - ## ------------ Start - ## note new issue: - # {"message": "starting container with non-empty request body was deprecated since API v1.22 and removed in v1.24"} - EMPTY_DATA="{}".encode('utf-8') - url = URL + 'containers/' + cid + '/start' - get_dagster_logger().info(f"Container start url: {url}") - req = request.Request(url,data=EMPTY_DATA, method="POST") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - try: - r = request.urlopen(req) - except HTTPError as err: - get_dagster_logger().fatal(f"Container Start failed: {str(err.code)} reason: {err.reason}") - raise err - except Exception as err: - print("failed to start container: unknown reason: ", err) - get_dagster_logger().info(f"Create Failed: unknown reason {str(err)}") - raise err - print(r.status) - get_dagster_logger().info(f"Start container: {str(r.status)}") - - # container.start() - # client.api.start(container=container.id) - ## start is not working - - for line in container.logs(stdout=True, 
stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - - # ## ------------ Wait expect 200 - exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=True).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: {str(r.status)}") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload - # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) - # pw_tar.extractall("extract_to/") - - # looks like get_archive also has issues. 
Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{str(mode)}_runlogs") - - # - if exit_status != 0: - raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") - finally: - if (not DEBUG) : - if (cid): - url = URL + 'containers/' + cid - req = request.Request(url, method="DELETE") - req.add_header('X-API-Key', APIKEY) - # req.add_header('content-type', 'application/json') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - print(r.status) - get_dagster_logger().info(f"Container Remove: {str(r.status)}") - else: - get_dagster_logger().info(f"Container Not created, so not removed.") - else: - get_dagster_logger().info(f"Container NOT Remove: DEBUG ENABLED") - - - return 0 - -@op -def ecrr_submitted_gleaner(context)-> str: - returned_value = gleanerio(context, ("gleaner"), "ecrr_submitted") - r = str('returned value:{}'.format(returned_value)) - get_dagster_logger().info(f"Gleaner notes are {r} ") - return r - -@op -def ecrr_submitted_nabu_prune(context, msg: str)-> str: - returned_value = gleanerio(context,("nabu"), "ecrr_submitted") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ecrr_submitted_nabuprov(context, msg: str)-> str: - returned_value = gleanerio(context,("prov"), "ecrr_submitted") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ecrr_submitted_nabuorg(context, msg: str)-> str: - returned_value = gleanerio(context,("orgs"), "ecrr_submitted") - r = str('returned value:{}'.format(returned_value)) - return msg + r - -@op -def ecrr_submitted_naburelease(context) -> str: - returned_value = 
gleanerio(context,("release"), "ecrr_submitted") - r = str('returned value:{}'.format(returned_value)) - return r -@op -def ecrr_submitted_uploadrelease(context, msg: str) -> str: - returned_value = postRelease("ecrr_submitted") - r = str('returned value:{}'.format(returned_value)) - return msg + r - - -@op -def ecrr_submitted_missingreport_s3(context, msg: str) -> str: - source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_submitted") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ecrr_submitted" - graphendpoint = None - milled = False - summon = True - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing repoort returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - s3Minio.putReportFile(bucket, source_name, "missing_report_s3.json", report) - return msg + r -@op -def ecrr_submitted_missingreport_graph(context, msg: str) -> str: - source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_submitted") - source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ecrr_submitted" - - graphendpoint = _graphEndpoint()# f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = True - summon = False # summon only off - returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) - r = str('missing report graph returned value:{}'.format(returned_value)) - report = json.dumps(returned_value, indent=2) - - s3Minio.putReportFile(bucket, source_name, "missing_report_graph.json", report) - - return msg + r -@op -def ecrr_submitted_graph_reports(context, msg: str) -> str: - 
source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_submitted") - #source_url = source.get('url') - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ecrr_submitted" - - graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" - - milled = False - summon = True - returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) - r = str('returned value:{}'.format(returned_value)) - #report = json.dumps(returned_value, indent=2) # value already json.dumps - report = returned_value - s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) - - return msg + r - -@op -def ecrr_submitted_identifier_stats(context, msg: str) -> str: - source = getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename="ecrr_submitted") - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ecrr_submitted" - - returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) - r = str('returned value:{}'.format(returned_value)) - #r = str('identifier stats returned value:{}'.format(returned_value)) - report = returned_value.to_json() - s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - return msg + r - -@op() -def ecrr_submitted_bucket_urls(context, msg: str) -> str: - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) - bucket = GLEANER_MINIO_BUCKET - source_name = "ecrr_submitted" - - res = s3Minio.listSummonedUrls(bucket, source_name) - r = str('returned value:{}'.format(res)) - bucketurls = json.dumps(res, indent=2) - s3Minio.putReportFile(GLEANER_MINIO_BUCKET, source_name, "bucketutil_urls.json", bucketurls) - return msg + r - - -#Can we simplify and use just a method. Then import these methods? 
-# def missingreport_s3(context, msg: str, source="ecrr_submitted"): -# -# source= getSitemapSourcesFromGleaner("/scheduler/gleanerconfig.yaml", sourcename=source) -# source_url = source.get('url') -# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), None) -# bucket = GLEANER_MINIO_BUCKET -# source_name="ecrr_submitted" -# -# graphendpoint = None -# milled = False -# summon = True -# returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) -# r = str('returned value:{}'.format(returned_value)) -# return msg + r -@graph -def harvest_ecrr_submitted(): -# harvest = ecrr_submitted_gleaner() - -# report_ms3 = ecrr_submitted_missingreport_s3(harvest) -# report_idstat = ecrr_submitted_identifier_stats(report_ms3) - # for some reason, this causes a msg parameter missing -# report_bucketurl = ecrr_submitted_bucket_urls(report_idstat) - - #report1 = missingreport_s3(harvest, source="ecrr_submitted") - load_release = ecrr_submitted_naburelease() - load_uploadrelease = ecrr_submitted_uploadrelease(load_release) - - load_prune = ecrr_submitted_nabu_prune(load_uploadrelease) - load_prov = ecrr_submitted_nabuprov(load_prune) - load_org = ecrr_submitted_nabuorg(load_prov) - -# run after load -# report_msgraph=ecrr_submitted_missingreport_graph(load_org) - report_graph=ecrr_submitted_graph_reports(load_release) - - - - diff --git a/dagster/implnets/workflows/ecrr/pyproject.toml b/dagster/implnets/workflows/ecrr/pyproject.toml new file mode 100644 index 00000000..f0d202f2 --- /dev/null +++ b/dagster/implnets/workflows/ecrr/pyproject.toml @@ -0,0 +1,6 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[tool.dagster] +module_name = "ecrr" diff --git a/dagster/implnets/workflows/ecrr/repositories/repository.py b/dagster/implnets/workflows/ecrr/repositories/repository.py deleted file mode 100644 index 263e13aa..00000000 --- 
a/dagster/implnets/workflows/ecrr/repositories/repository.py +++ /dev/null @@ -1,13 +0,0 @@ -from dagster import repository -from jobs.implnet_jobs_ecrr_submitted import implnet_job_ecrr_submitted -from sch.implnet_sch_ecrr_submitted import implnet_sch_ecrr_submitted -from jobs.implnet_jobs_ecrr_examples import implnet_job_ecrr_examples -from sch.implnet_sch_ecrr_examples import implnet_sch_ecrr_examples - -@repository(name="ecrr") -def ecrr(): - jobs = [implnet_job_ecrr_submitted, implnet_job_ecrr_examples] - schedules = [implnet_sch_ecrr_submitted, implnet_sch_ecrr_examples] - - - return jobs + schedules diff --git a/dagster/implnets/workflows/ecrr/sch/implnet_sch_ecrr_examples.py b/dagster/implnets/workflows/ecrr/sch/implnet_sch_ecrr_examples.py deleted file mode 100644 index d7bba855..00000000 --- a/dagster/implnets/workflows/ecrr/sch/implnet_sch_ecrr_examples.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ecrr_examples import implnet_job_ecrr_examples - -@schedule(cron_schedule="0 12 * * 3", job=implnet_job_ecrr_examples, execution_timezone="US/Central") -def implnet_sch_ecrr_examples(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/workflows/ecrr/sch/implnet_sch_ecrr_submitted.py b/dagster/implnets/workflows/ecrr/sch/implnet_sch_ecrr_submitted.py deleted file mode 100644 index e6c739d9..00000000 --- a/dagster/implnets/workflows/ecrr/sch/implnet_sch_ecrr_submitted.py +++ /dev/null @@ -1,8 +0,0 @@ -from dagster import schedule - -from jobs.implnet_jobs_ecrr_submitted import implnet_job_ecrr_submitted - -@schedule(cron_schedule="0 0 * * 0", job=implnet_job_ecrr_submitted, execution_timezone="US/Central") -def implnet_sch_ecrr_submitted(_context): - run_config = {} - return run_config diff --git a/dagster/implnets/workflows/ecrr/tests/__init__.py b/dagster/implnets/workflows/ecrr/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/dagster/implnets/workflows/ingest/NOTES.md b/dagster/implnets/workflows/ingest/NOTES.md new file mode 100644 index 00000000..3b9cd820 --- /dev/null +++ b/dagster/implnets/workflows/ingest/NOTES.md @@ -0,0 +1,26 @@ + +# Schedules + +It's hard to define a set of dynamic Schedules, with varying crons +https://github.com/dagster-io/dagster/discussions/22121 + +Right now, all sources will run weekly + +while not ideal, I think we could set up four schedules: daily, weekly, monthly and quarterly. +Then if the cron in the source matched, a run would occur. + +more complex would be having something that ran (hourly), +and went through the list of sources and their last runs, and if it was time to run, then ran that source. +Basically put an evaluation function in before seeing if a run should occur, +if it should, add a run request to the list of run requests for that time, then return that list. + + +How do I write a sensor or schedule that requests a run for every partition on every tick? +https://github.com/dagster-io/dagster/discussions/15532 + +partition metadata about last run: https://github.com/dagster-io/dagster/discussions/14338 +How to ensure the previous partition of a job has succeeded before running the next partition https://github.com/dagster-io/dagster/discussions/10264 + +dynamic partitions +https://docs.dagster.io/concepts/partitions-schedules-sensors/partitioning-assets#dynamically-partitioned-assets + diff --git a/dagster/implnets/workflows/ingest/README.md b/dagster/implnets/workflows/ingest/README.md new file mode 100644 index 00000000..cea90599 --- /dev/null +++ b/dagster/implnets/workflows/ingest/README.md @@ -0,0 +1,53 @@ +# Ingest Rework + +This is an attempt to rework the ingest system, to split the summon/release file from the load to graph +and clean graph, and the reporting. + +**the model is that** +1. we read a list of sources. In the long term this will be a file in an s3 bucket with just the gleanerio source information +2.
for each source, we harvest (summon, create release, (optional flag: summarize, load summarize))... in the long term, this will need to create a dynamic schedule +3. generate reports and stats +4. read and create communities (from an s3 location?) + 5. all + 6. customized +5. update community sensor + 6. when a source is updated, update the community + +## gleaner io container routines +* summon : run gleaner, run nabu release + * assets -> summon path (metadata: s3path, file count, time), release file (metadata: s3path, size, time), reports +* release +* prune +* prov +* orgs + +## ops: + * Load to graph + * summarize + * load summarize + * reports + * graph (prune, prov, orgs) + * community stats + * UI +## Sensor: +These routines are useful to all communities. + +* new release file + * run prov + * run bucket report, missing report, identifier report + * run summarize. Not needed by all communities, but prevents duplicate op from being run. Can add a flag. + +# Sensor for a community +Have a sensor that looks at the release files, and then determines if a release needs to be pushed to a community +if this release is a source in my community. + * load graph + * graph report + * load prov + * load summarize + * (ec) run community stats (about) + + + +This is a [Dagster](https://dagster.io/) project made to be used alongside the official [Dagster tutorial](https://docs.dagster.io/tutorial). + +Use Dagster AWS for minio configuration diff --git a/dagster/implnets/workflows/ingest/data/tenant.yaml b/dagster/implnets/workflows/ingest/data/tenant.yaml new file mode 100644 index 00000000..60fe2095 --- /dev/null +++ b/dagster/implnets/workflows/ingest/data/tenant.yaml @@ -0,0 +1,29 @@ +# prototype tenants file + +# prototype tenants file + +tenant: + - community: dev + hostname: geocodes-dev + description: GeoCodes is...
+ name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: test + summary_namespace: test_summary + sources: + - iris + - opentopography +###### + - community: geocodesall + hostname: geocodes-all + description: GeoCodes is... + name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: geocodes_test + summary_namespace: geocodes_test_summary + sources: + - all diff --git a/dagster/implnets/workflows/ingest/data/tenant_dev.yaml b/dagster/implnets/workflows/ingest/data/tenant_dev.yaml new file mode 100644 index 00000000..dee0699c --- /dev/null +++ b/dagster/implnets/workflows/ingest/data/tenant_dev.yaml @@ -0,0 +1,28 @@ +# prototype tenants file + +tenant: + - community: dev + hostname: geocodes-dev + description: GeoCodes is... + name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: test + summary_namespace: test_summary + sources: + - iris + - opentopography +###### + - community: geocodesall + hostname: geocodes-all + description: GeoCodes is... + name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: geocodes_test + summary_namespace: geocodes_test_summary + sources: + - all + diff --git a/dagster/implnets/workflows/ingest/data/tenant_prod.yaml b/dagster/implnets/workflows/ingest/data/tenant_prod.yaml new file mode 100644 index 00000000..9661df09 --- /dev/null +++ b/dagster/implnets/workflows/ingest/data/tenant_prod.yaml @@ -0,0 +1,40 @@ +# prototype tenants file + +tenant: + - community: production + hostname: geocodes-aws + description: GeoCodes is...
+ name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: geocodes + summary_namespace: geocodes_summary + sources: + - bcodmo + - r2r +###### + - community: geocodesall + hostname: geocodes-all + description: GeoCodes is... + name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: earthcube + summary_namespace: earthcube_summary + sources: + - all +###### + - community: deepoceans + hostname: oceans + description: this is a test 1 + name: A community description + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: oceans + summary_namespace: oceans_summary + sources: + - bcodmo + - r2r diff --git a/dagster/implnets/workflows/ingest/data/tennant.yaml b/dagster/implnets/workflows/ingest/data/tennant.yaml new file mode 100644 index 00000000..08c7d57b --- /dev/null +++ b/dagster/implnets/workflows/ingest/data/tennant.yaml @@ -0,0 +1,29 @@ +# prototype tenants file + +# prototype tenants file + +tennant: + - community: dev + hostname: geocodes-dev + description: GeoCodes is... + name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: test + summary_namespace: test_summary + sources: + - iris + - opentopography +###### + - community: geocodesall + hostname: geocodes-all + description: GeoCodes is...
+ name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: geocodes_test + summary_namespace: geocodes_test_summary + sources: + - all diff --git a/dagster/implnets/workflows/ingest/data/tennant_dev.yaml b/dagster/implnets/workflows/ingest/data/tennant_dev.yaml new file mode 100644 index 00000000..5a83443c --- /dev/null +++ b/dagster/implnets/workflows/ingest/data/tennant_dev.yaml @@ -0,0 +1,28 @@ +# prototype tenants file + +tennant: + - community: dev + hostname: geocodes-dev + description: GeoCodes is... + name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: test + summary_namespace: test_summary + sources: + - iris + - opentopography +###### + - community: geocodesall + hostname: geocodes-all + description: GeoCodes is... + name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: geocodes_test + summary_namespace: geocodes_test_summary + sources: + - all + diff --git a/dagster/implnets/workflows/ingest/data/tennant_prod.yaml b/dagster/implnets/workflows/ingest/data/tennant_prod.yaml new file mode 100644 index 00000000..defc8bd7 --- /dev/null +++ b/dagster/implnets/workflows/ingest/data/tennant_prod.yaml @@ -0,0 +1,40 @@ +# prototype tenants file + +tennant: + - community: production + hostname: geocodes-aws + description: GeoCodes is... + name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: geocodes + summary_namespace: geocodes_summary + sources: + - bcodmo + - r2r +###### + - community: geocodesall + hostname: geocodes-all + description: GeoCodes is...
+ name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: earthcube + summary_namespace: earthcube_summary + sources: + - all +###### + - community: deepoceans + hostname: oceans + description: this is a test 1 + name: A community description + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: oceans + summary_namespace: oceans_summary + sources: + - bcodmo + - r2r diff --git a/dagster/implnets/workflows/ingest/ingest/__init__.py b/dagster/implnets/workflows/ingest/ingest/__init__.py new file mode 100644 index 00000000..0baa2355 --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/__init__.py @@ -0,0 +1,240 @@ +########### NOTES ON THIS #### +# the resources need to be correct for the code to run, +# * fields need to be defined. they cannot be + +# BlazegraphResource(), + +# need to have definitions. + +# BlazegraphResource( +# GLEANERIO_GRAPH_URL=EnvVar('GLEANERIO_GRAPH_URL'), +# GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_NAMESPACE'), +# ) +#### QUIRKS ### +# if a type is changed in a configuration, you need to change all the configs, and not just one.
+# so when + +import os + +from dagster import Definitions, load_assets_from_modules, EnvVar +from dagster_aws.s3.resources import S3Resource +from dagster_aws.s3.ops import S3Coordinate +from dagster import ( + AssetSelection, + Definitions, + define_asset_job, +) +from dagster_slack import SlackResource, make_slack_on_run_failure_sensor + +from .resources.graph import BlazegraphResource, GraphResource +from .resources.gleanerio import GleanerioResource +from .resources.gleanerS3 import gleanerS3Resource +from .assets import ( + gleanerio_run, + release_nabu_run +) + +from .jobs.summon_assets import summon_asset_job +from .jobs import ( + summon_asset_job, sources_asset_job, + sources_partitions_def + ,tenant_asset_job, + tenant_namespaces_job, + release_asset_job +) + +jobs = [ +summon_asset_job, sources_asset_job, + tenant_asset_job, + tenant_namespaces_job, + release_asset_job +] +from pydantic import Field + +from . import assets +from .utils import PythonMinioAddress + + +all_assets = load_assets_from_modules([assets]) + +#harvest_job = define_asset_job(name="harvest_job", selection="harvest_and_release") + +from .sensors import ( + release_file_sensor, +release_file_sensor_v2, + sources_sensor, + tenant_names_sensor, + sources_s3_sensor, + tenant_s3_sensor, +#tenant_names_sensor_v2 +) + +slack_on_run_failure = make_slack_on_run_failure_sensor( + os.getenv("SLACK_CHANNEL"), + os.getenv("SLACK_TOKEN") +) +all_sensors = [ + slack_on_run_failure, + # release_file_sensor, +release_file_sensor_v2, + sources_sensor, # original code. 
Now use a schedule + tenant_names_sensor, + sources_s3_sensor, + tenant_s3_sensor, +#tenant_names_sensor_v2 + ] + +from .sensors.gleaner_summon import sources_schedule + +all_schedules = [sources_schedule] + +def _awsEndpointAddress(url, port=None, use_ssl=True): + if use_ssl: + protocol = "https" + else: + protocol = "http" + if port is not None: + return f"{protocol}://{url}:{port}" + else: + return f"{protocol}://{url}" + +s3=S3Resource( + endpoint_url =_awsEndpointAddress( + EnvVar('GLEANERIO_MINIO_ADDRESS').get_value(), + port=EnvVar('GLEANERIO_MINIO_PORT').get_value(), + use_ssl=EnvVar('GLEANERIO_MINIO_USE_SSL').get_value() + ), + aws_access_key_id=EnvVar('GLEANERIO_MINIO_ACCESS_KEY'), + aws_secret_access_key=EnvVar('GLEANERIO_MINIO_SECRET_KEY') +) +gleaners3=gleanerS3Resource( + # GLEANER_MINIO_BUCKET =EnvVar('GLEANER_MINIO_BUCKET'), + # GLEANER_MINIO_ADDRESS=EnvVar('GLEANER_MINIO_ADDRESS'), + # GLEANER_MINIO_PORT=EnvVar('GLEANER_MINIO_PORT'), + GLEANERIO_MINIO_BUCKET=EnvVar('GLEANERIO_MINIO_BUCKET'), + GLEANERIO_MINIO_ADDRESS=EnvVar('GLEANERIO_MINIO_ADDRESS'), + GLEANERIO_MINIO_PORT=EnvVar('GLEANERIO_MINIO_PORT'), + GLEANERIO_MINIO_USE_SSL=os.environ.get('GLEANERIO_MINIO_USE_SSL', "True"), + GLEANERIO_MINIO_ACCESS_KEY=EnvVar('GLEANERIO_MINIO_ACCESS_KEY'), + GLEANERIO_MINIO_SECRET_KEY=EnvVar('GLEANERIO_MINIO_SECRET_KEY'), + GLEANERIO_CONFIG_PATH=os.environ.get('GLEANERIO_CONFIG_PATH'), + GLEANERIO_SOURCES_FILENAME=os.environ.get('GLEANERIO_SOURCES_FILENAME'), + GLEANERIO_TENANT_FILENAME=os.environ.get('GLEANERIO_TENANT_FILENAME'), + # this is S3. 
It is the s3 resource + s3=s3 + +) +triplestore=BlazegraphResource( + GLEANERIO_GRAPH_URL=EnvVar('GLEANERIO_GRAPH_URL'), + GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_NAMESPACE'), + gs3=gleaners3, + ) +triplestore_summary=BlazegraphResource( + GLEANERIO_GRAPH_URL=EnvVar('GLEANERIO_GRAPH_URL'), + GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_SUMMARY_NAMESPACE'), + gs3=gleaners3, + ) + +resources = { + "local": { + "gleanerio": GleanerioResource( +# DEBUG=os.environ.get('DEBUG'), + DEBUG_CONTAINER=False, + GLEANERIO_DOCKER_URL=EnvVar('GLEANERIO_DOCKER_URL'), + GLEANERIO_PORTAINER_APIKEY=EnvVar('GLEANERIO_PORTAINER_APIKEY'), + + GLEANERIO_DOCKER_HEADLESS_NETWORK=EnvVar('GLEANERIO_DOCKER_HEADLESS_NETWORK'), + GLEANERIO_HEADLESS_ENDPOINT=EnvVar('GLEANERIO_HEADLESS_ENDPOINT'), + + GLEANERIO_GLEANER_IMAGE=EnvVar('GLEANERIO_GLEANER_IMAGE'), + GLEANERIO_NABU_IMAGE=EnvVar('GLEANERIO_NABU_IMAGE'), + + GLEANERIO_DAGSTER_CONFIG_PATH=EnvVar('GLEANERIO_DAGSTER_CONFIG_PATH'), + + + GLEANERIO_DOCKER_NABU_CONFIG=EnvVar('GLEANERIO_DOCKER_NABU_CONFIG'), + GLEANERIO_DOCKER_GLEANER_CONFIG=EnvVar('GLEANERIO_DOCKER_GLEANER_CONFIG'), + + GLEANERIO_NABU_CONFIG_PATH=EnvVar('GLEANERIO_NABU_CONFIG_PATH'), + GLEANERIO_GLEANER_CONFIG_PATH=EnvVar('GLEANERIO_GLEANER_CONFIG_PATH'), + + GLEANERIO_LOG_PREFIX=EnvVar('GLEANERIO_LOG_PREFIX'), + + GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=os.environ.get('GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT',600), + GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_NAMESPACE'), + GLEANERIO_GRAPH_SUMMARY_NAMESPACE=EnvVar('GLEANERIO_GRAPH_SUMMARY_NAMESPACE'), + gs3=gleaners3, + # s3=gleanerS3Resource( + # GLEANERIO_MINIO_ADDRESS="oss.geocodes-aws-dev.earthcube.org", + # GLEANERIO_MINIO_PORT=443, + # GLEANERIO_MINIO_USE_SSL=True, + # GLEANERIO_MINIO_BUCKET="test", + # GLEANERIO_MINIO_ACCESS_KEY="worldsbestaccesskey", + # GLEANERIO_MINIO_SECRET_KEY="worldsbestsecretkey", + # ), + triplestore=triplestore, + # triplestore=BlazegraphResource( + # 
GLEANERIO_GRAPH_URL=EnvVar('GLEANERIO_GRAPH_URL'), + # GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_NAMESPACE'), + # ), + triplestore_summary=triplestore_summary + ), # gleaner + "s3":s3, + "gs3":gleaners3, + "triplestore": triplestore, + "slack": SlackResource(token=EnvVar("SLACK_TOKEN")), + }, + "production": { + "gleanerio": GleanerioResource( + DEBUG_CONTAINER=False, + + GLEANERIO_DOCKER_URL=EnvVar('GLEANERIO_DOCKER_URL'), + GLEANERIO_PORTAINER_APIKEY=EnvVar('GLEANERIO_PORTAINER_APIKEY'), + + GLEANERIO_DOCKER_HEADLESS_NETWORK=EnvVar('GLEANERIO_DOCKER_HEADLESS_NETWORK'), + GLEANERIO_HEADLESS_ENDPOINT=EnvVar('GLEANERIO_HEADLESS_ENDPOINT'), + + GLEANERIO_GLEANER_IMAGE=EnvVar('GLEANERIO_GLEANER_IMAGE'), + GLEANERIO_NABU_IMAGE=EnvVar('GLEANERIO_NABU_IMAGE'), + + GLEANERIO_DAGSTER_CONFIG_PATH=EnvVar('GLEANERIO_DAGSTER_CONFIG_PATH'), + + + GLEANERIO_DOCKER_NABU_CONFIG=EnvVar('GLEANERIO_DOCKER_NABU_CONFIG'), + GLEANERIO_DOCKER_GLEANER_CONFIG=EnvVar('GLEANERIO_DOCKER_GLEANER_CONFIG'), + + GLEANERIO_NABU_CONFIG_PATH=EnvVar('GLEANERIO_NABU_CONFIG_PATH'), + GLEANERIO_GLEANER_CONFIG_PATH=EnvVar('GLEANERIO_GLEANER_CONFIG_PATH'), + + GLEANERIO_LOG_PREFIX=EnvVar('GLEANERIO_LOG_PREFIX'), + + GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=os.environ.get('GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT',600), + GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_NAMESPACE'), + GLEANERIO_GRAPH_SUMMARY_NAMESPACE=EnvVar('GLEANERIO_GRAPH_SUMMARY_NAMESPACE'), + gs3=gleaners3, + triplestore=triplestore, + triplestore_summary=triplestore_summary, + + + ), # gleaner + # this nees to be s3 so s3 can find it. 
+ "s3":s3, + "gs3":gleaners3, + "triplestore":triplestore, + "slack":SlackResource(token=EnvVar("SLACK_TOKEN")), + }, +} + +deployment_name = os.environ.get("DAGSTER_DEPLOYMENT", "local") + + + +defs = Definitions( + assets=all_assets, + resources=resources[deployment_name], + sensors=all_sensors, + jobs=jobs, + schedules=all_schedules +# jobs=[harvest_job] + +) diff --git a/dagster/implnets/workflows/ingest/ingest/assets/__init__.py b/dagster/implnets/workflows/ingest/ingest/assets/__init__.py new file mode 100644 index 00000000..52c0d72b --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/assets/__init__.py @@ -0,0 +1,20 @@ +from .gleaner_geocdoes_demo import gleanerio_demo +from .gleaner_summon_assets import ( + gleanerio_run, release_nabu_run, release_summarize, + load_report_s3,load_report_graph,validate_sitemap_url, + bucket_urls, identifier_stats, + graph_stats_report, + SUMMARY_PATH,RELEASE_PATH +) +from .gleaner_sources import ( + gleanerio_orgs, gleanerio_tenants, + gleanerio_sources, + tenant_partitions_def + , sources_partitions_def +) + +from .tenant import ( + TenantOpConfig, TenantConfig, + upload_release,upload_summary, + create_tenant_containers, create_graph_namespaces +) diff --git a/dagster/implnets/workflows/ingest/ingest/assets/gleaner_asset_factory.py b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_asset_factory.py new file mode 100644 index 00000000..1f3bf8ab --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_asset_factory.py @@ -0,0 +1,13 @@ + +from dagster import get_dagster_logger, asset, In, Nothing, Config + +from ..resources import gleanerio +class gleaner(Config): + source: str + + def create_gleaner_asset(self,context): + @asset(name=f"{self.source}_gleaner") + def _gleanerio(): + gleanerio(context, ("gleaner"), self.source) + + return _gleanerio() diff --git a/dagster/implnets/workflows/ingest/ingest/assets/gleaner_geocdoes_demo.py 
b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_geocdoes_demo.py new file mode 100644 index 00000000..e0c1e9d9 --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_geocdoes_demo.py @@ -0,0 +1,19 @@ +# a test asset to see that all the resource configurations load. +# basically runs the first step, of gleaner on geocodes demo datasets + +from dagster import get_dagster_logger, asset, In, Nothing, Config + +from ..resources.gleanerio import GleanerioResource +@asset(key_prefix="ingest",required_resource_keys={"gleanerio"}) +def gleanerio_demo(context ): + gleaner_resource = foo = context.resources.gleanerio + source="geocodes_demo_datasets" + gleaner = gleaner_resource.execute(context, "gleaner", source ) + context.add_output_metadata( + metadata={ + "source": source, # Metadata can be any key-value pair + "run": "gleaner", + # The `MetadataValue` class has useful static methods to build Metadata + } + ) + diff --git a/dagster/implnets/workflows/ingest/ingest/assets/gleaner_sources.py b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_sources.py new file mode 100644 index 00000000..dbbf24c4 --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_sources.py @@ -0,0 +1,175 @@ +# a test asset to see that all the resource configurations load. +# basically runs the first step, of gleaner on geocodes demo datasets +import orjson + +import dagster +from dagster import get_dagster_logger, asset,multi_asset, AssetOut, In, Nothing, Config,DynamicPartitionsDefinition, sensor +import yaml +from ec.sitemap import Sitemap + +sources_partitions_def = DynamicPartitionsDefinition(name="sources_names_active") +#from ..resources.gleanerio import GleanerioResource +tenant_partitions_def = DynamicPartitionsDefinition(name="tenant_names_paritition") +### PRESENT HACK. 
Using the orgs +# really needs to read a future tenant file, and then add +# new partions with a sensor +# need to add a sensor to add paritions when one is added +# https://docs.dagster.io/concepts/partitions-schedules-sensors/partitioning-assets#dynamically-partitioned-assets + +# for right now, using a list of orgs as the sources. +# future read the gleaner config file. +# future future, store sources in (s3/googlesheets) and read them. + + +@asset( + #group_name="configs", + name="org_names",key_prefix="ingest",required_resource_keys={"gs3"}) +def gleanerio_orgs(context ): + s3_resource = context.resources.gs3 + source="orgs_list_from_a_s3_bucket" + files = s3_resource.listPath(path='orgs') + orgs = list(map(lambda o: o["Key"].removeprefix("orgs/").removesuffix(".nq") , files)) + dagster.get_dagster_logger().info(str(orgs)) + context.add_output_metadata( + metadata={ + "source": source, # Metadata can be any key-value pair + "run": "gleaner", + # The `MetadataValue` class has useful static methods to build Metadata + } + ) + #return orjson.dumps(orgs, option=orjson.OPT_INDENT_2) + # this is used for partitioning, so let it pickle (aka be a python list) + return orgs +#@asset(group_name="configs",name="tenant_names",required_resource_keys={"gs3"}) +@multi_asset( + + outs= + { + "tenant_all": AssetOut(key_prefix="ingest", + group_name="configs",), + "tenant_names": AssetOut(key_prefix="ingest", + group_name="configs",), + } + ,required_resource_keys={"gs3"} + ) +def gleanerio_tenants(context): + gleaner_resource = context.resources.gs3 + s3_resource = context.resources.gs3 + # tennant_path = f'{s3_resource.GLEANERIO_CONFIG_PATH}{s3_resource.GLEANERIO_TENANT_FILENAME}' + # get_dagster_logger().info(f"tennant_path {tennant_path} ") + # + # tennant = s3_resource.getFile(path=tennant_path) + tenant = s3_resource.getTennatFile() + get_dagster_logger().info(f"tenant {tenant} ") + tenant_obj = yaml.safe_load(tenant) + tenants = list(map(lambda t: t["community"], 
tenant_obj["tenant"] )) + context.add_output_metadata( + metadata={ + "source": tenants, # Metadata can be any key-value pair + "run": "gleaner", + # The `MetadataValue` class has useful static methods to build Metadata + }, output_name="tenant_all" + ) + context.add_output_metadata( + metadata={ + "source": tenants, # Metadata can be any key-value pair + "run": "gleaner", + # The `MetadataValue` class has useful static methods to build Metadata + }, output_name="tenant_names" + ) + #return orjson.dumps(orgs, option=orjson.OPT_INDENT_2) + # this is used for partitioning, so let it pickle (aka be a python list) + return tenant_obj, tenants + +""" +check a soruce list, return invalid and valid sources lists +""" +def check_for_valid_sitemap( sources_active): + validated_sources=[] + for source in sources_active: + try: + sm = Sitemap(source['url'], no_progress_bar=True) + + source['sm_url_is_valid'] = sm.validUrl() + validated_sources.append(source) + get_dagster_logger().info(f" sitemap url valid {source['sm_url_is_valid']} for {source['name']} {source['url']}") + except Exception as e: + get_dagster_logger().error(f" sitemap url ERROR for {source['name']} {source['url']} exception {e}") + source['sm_url_is_valid'] = False + validated_sources.append(source) + return validated_sources +@multi_asset( + + outs= + { + "sources_all": AssetOut(key_prefix="ingest", + group_name="configs",), + "sources_names_active": AssetOut(key_prefix="ingest", + group_name="configs",), +"sources_names_invalid_sitemap": AssetOut(key_prefix="ingest", + group_name="configs",), + } + ,required_resource_keys={"gs3"}) +def gleanerio_sources(context ): + + s3_resource = context.resources.gs3 + # tennant_path = f'{s3_resource.GLEANERIO_CONFIG_PATH}{s3_resource.GLEANERIO_TENANT_FILENAME}' + # get_dagster_logger().info(f"tennant_path {tennant_path} ") + # + # tennant = s3_resource.getFile(path=tennant_path) + source = s3_resource.getSourcesFile() + get_dagster_logger().info(f"sources {source} ") + 
sources_obj = yaml.safe_load(source) + sources_all_value = list(filter(lambda t: t["name"], sources_obj["sources"])) + sources_active_value = filter(lambda t: t["active"], sources_all_value ) + source_sm_validated = list(check_for_valid_sitemap( sources_active_value)) + context.log.info(f"validated sitemaps {source_sm_validated} ") + sources_active_names = list(map(lambda t: t["name"], filter(lambda t: t["sm_url_is_valid"], source_sm_validated ))) + sources_invalid_sm = list(map(lambda t: t["name"], filter(lambda t: not t["sm_url_is_valid"], source_sm_validated))) + + context.add_output_metadata( + metadata={ + "source": sources_active_names, # Metadata can be any key-value pair + "run": "gleaner", + # The `MetadataValue` class has useful static methods to build Metadata + }, output_name="sources_names_active" + ) + #return orjson.dumps(orgs, option=orjson.OPT_INDENT_2) + # this is used for partitioning, so let it pickle (aka be a python list) + return sources_all_value, sources_active_names,sources_invalid_sm +# @asset(required_resource_keys={"gs3"}) +# def gleanerio_orgs(context ): +# s3_resource = context.resources.gs3 +# source="geocodes_demo_datasets" +# files = s3_resource.listPath(path='orgs') +# orgs = list(map(lambda o: o["Key"].removeprefix("orgs/").removesuffix(".nq") , files)) +# # rather than do this with an @asset_sensor, just do it here. 
+# sources = orgs +# new_sources = [ +# source +# for source in sources +# if not sources_partitions_def.has_partition_key( +# source, dynamic_partitions_store=context.instance +# ) +# ] +# sources_partitions_def.build_add_request(new_sources) +# # return SensorResult( +# # run_requests=[ +# # RunRequest(partition_key=source) for source in new_sources +# # ], +# # dynamic_partitions_requests=[ +# # sources_partitions_def.build_add_request(new_sources) +# # ], +# # ) +# dagster.get_dagster_logger().info(str(orgs)) +# context.add_output_metadata( +# metadata={ +# "source": source, # Metadata can be any key-value pair +# "new_sources":new_sources, +# "run": "gleaner", +# # The `MetadataValue` class has useful static methods to build Metadata +# } +# ) +# #return orjson.dumps(orgs, option=orjson.OPT_INDENT_2) +# # this is used for partitioning, so let it pickle (aka be a python list) +# return orgs diff --git a/dagster/implnets/workflows/ingest/ingest/assets/gleaner_summon_assets.py b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_summon_assets.py new file mode 100644 index 00000000..c5d6e375 --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_summon_assets.py @@ -0,0 +1,437 @@ +# a test asset to see that all the resource configurations load. 
+# basically runs the first step, of gleaner on geocodes demo datasets +from typing import Any +import json +import pandas as pd +import csv +from urllib.error import HTTPError + +from dagster import ( + asset,op, Config, Output,AssetKey, + define_asset_job, AssetSelection, +get_dagster_logger,BackfillPolicy +) +from ec.datastore import s3 as utils_s3 +from ec.sitemap import Sitemap +from .gleaner_sources import sources_partitions_def +from ..utils import PythonMinioAddress + +from ec.gleanerio.gleaner import getGleaner, getSitemapSourcesFromGleaner, endpointUpdateNamespace +from ec.reporting.report import missingReport, generateIdentifierRepo, generateGraphReportsRelease +from ec.graph.release_graph import ReleaseGraph +from ec.summarize import summaryDF2ttl, get_summary4graph, get_summary4repoSubset + +SUMMARY_PATH = 'graphs/summary' +RELEASE_PATH = 'graphs/latest' + +class HarvestOpConfig(Config): + source_name: str +# sources_partitions_def = StaticPartitionsDefinition( +# ["geocodes_demo_datasets", "iris"] +# ) + +def getSource(context, source_name): + sources = context.repository_def.load_asset_value(AssetKey(["ingest","sources_all"])) + source = list(filter(lambda t: t["name"]==source_name, sources)) + return source[0] + +@asset( + group_name="load", + key_prefix="ingest", + deps=[AssetKey(["ingest","sources_names_active"]) ], + partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"} + # , backfill_policy=BackfillPolicy.single_run() + ) +def validate_sitemap_url(context): + source_name = context.asset_partition_key_for_output() + source = getSource(context, source_name) + sm = Sitemap(source['url'], no_progress_bar=True) + if sm.validUrl(): + return source['url'] + else: + context.log.error(f"source: {source['name']} bad url: {source['url']}") + raise HTTPError(url=source['url'], + code=404, + hdrs=None, + fp=None, + msg=f"Bad URL ource: {source['name']} bad url: {source['url']}" ) + +@asset(group_name="load", +key_prefix="ingest", 
+op_tags={"ingest": "docker"}, + deps=[ validate_sitemap_url ], + partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"} + # , backfill_policy=BackfillPolicy.single_run() + ) +#@asset( required_resource_keys={"gleanerio"}) +def gleanerio_run(context ) -> Output[Any]: + gleaner_resource = context.resources.gleanerio + source= context.asset_partition_key_for_output() + gleaner = gleaner_resource.execute(context, "gleaner", source ) + + metadata={ + "source": source, # Metadata can be any key-value pair + "run": "gleaner", + # The `MetadataValue` class has useful static methods to build Metadata + } + + return Output(gleaner, metadata=metadata) +@asset(group_name="load", +key_prefix="ingest", +op_tags={"ingest": "docker"}, + deps=[gleanerio_run], + partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"} + # ,backfill_policy=BackfillPolicy.single_run() + ) +#@asset(required_resource_keys={"gleanerio"}) +def release_nabu_run(context) -> Output[Any]: + gleaner_resource = context.resources.gleanerio + source= context.asset_partition_key_for_output() + nabu=gleaner_resource.execute(context, "release", source ) + metadata={ + "source": source, # Metadata can be any key-value pair + "run": "release", + "bucket_name": gleaner_resource.gs3.GLEANERIO_MINIO_BUCKET, # Metadata can be any key-value pair + "object_name": f"{RELEASE_PATH}{source}" + # The `MetadataValue` class has useful static methods to build Metadata + } + + return Output(nabu, metadata=metadata) +''' Return results of summoning the JSON-LD SOS from a source. +This includes the number of url in the sitemap, how many jsonLD were 'summoned' +There may be multiple json-ld per web page, so this needs to be monitored over time. +And how many made it into milled (this is how good the conversion at a single jsonld to RDF is. 
+ +''' + +@asset( +key_prefix="ingest", + group_name="load", +op_tags={"ingest": "report"}, + deps=[gleanerio_run], partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"} + # , backfill_policy=BackfillPolicy.single_run() +) +def load_report_s3(context): + gleaner_resource = context.resources.gleanerio + s3_resource = context.resources.gleanerio.gs3.s3 + gleaner_s3 = context.resources.gleanerio.gs3 + source_name = context.asset_partition_key_for_output() + # source = getSitemapSourcesFromGleaner(gleaner_resource.GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name) + source = getSource(context, source_name) + source_url = source.get('url') + s3Minio = utils_s3.MinioDatastore(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS, + gleaner_s3.GLEANERIO_MINIO_PORT), + gleaner_s3.MinioOptions() + ) + bucket = gleaner_s3.GLEANERIO_MINIO_BUCKET + + graphendpoint = None + milled = False + summon = True + returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=summon) + r = str('load repoort returned value:{}'.format(returned_value)) + report = json.dumps(returned_value, indent=2) + s3Minio.putReportFile(bucket, source_name, "load_report_s3.json", report) + get_dagster_logger().info(f"load s3 report returned {r} ") + return + + +''' Return results of what JSON-LD SOS is the S3 store, and compares it to the 'Named' graphs +in the graph store. This extends the load report s3. +This includes the number of url in the sitemap, how many jsonLD were 'summoned' +There may be multiple json-ld per web page, so this needs to be monitored over time. +And how many made it into milled (this is how good the conversion at a single jsonld to RDF is. 
+It then compares what identifiers are in the S3 store (summon path), and the Named Graph URI's +''' + +@asset( +key_prefix="ingest", + group_name="load", +op_tags={"ingest": "report"}, + deps=[release_nabu_run], partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"} + # , backfill_policy=BackfillPolicy.single_run() +) +def load_report_graph(context): + gleaner_resource = context.resources.gleanerio + s3_resource = context.resources.gleanerio.gs3.s3 + gleaner_s3 = context.resources.gleanerio.gs3 + gleaner_triplestore = context.resources.gleanerio.triplestore + + source_name = context.asset_partition_key_for_output() + # source = getSitemapSourcesFromGleaner(gleaner_resource.GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name) + source = getSource(context, source_name) + source_url = source.get('url') + s3Minio = utils_s3.MinioDatastore(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS, + gleaner_s3.GLEANERIO_MINIO_PORT), + gleaner_s3.MinioOptions() + ) + bucket = gleaner_s3.GLEANERIO_MINIO_BUCKET + + graphendpoint = gleaner_triplestore.GraphEndpoint(gleaner_resource.GLEANERIO_GRAPH_NAMESPACE) + milled = False + summon = True + returned_value = missingReport(source_url, bucket, source_name, s3Minio, graphendpoint, milled=milled, summon=False) # summon false. 
we want the graph + r = str('load repoort graph returned value:{}'.format(returned_value)) + report = json.dumps(returned_value, indent=2) + s3Minio.putReportFile(bucket, source_name, "load_report_graph.json", report) + get_dagster_logger().info(f"load report to graph returned {r} ") + return +class S3ObjectInfo: + bucket_name="" + object_name="" +@asset(group_name="load",key_prefix="ingest", + name="release_summarize", + deps=[release_nabu_run], partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"} + # , backfill_policy=BackfillPolicy.single_run() + ) +def release_summarize(context) : + gleaner_resource = context.resources.gleanerio + s3_resource = context.resources.gleanerio.gs3.s3 + gleaner_s3 = context.resources.gleanerio.gs3 + triplestore =context.resources.gleanerio.triplestore + source_name = context.asset_partition_key_for_output() + #source = getSitemapSourcesFromGleaner(gleaner_resource.GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name) + source = getSource(context,source_name) + source_url = source.get('url') + s3Minio = utils_s3.MinioDatastore(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS, + gleaner_s3.GLEANERIO_MINIO_PORT), + gleaner_s3.MinioOptions() + ) + bucket = gleaner_s3.GLEANERIO_MINIO_BUCKET + + # endpoint = triplestore.GraphEndpoint# getting data, not uploading data + #summary_namespace = _graphSummaryEndpoint() + + try: + + # summarydf = get_summary4repoSubset(endpoint, source_name) + rg = ReleaseGraph() + rg.read_release(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS, + gleaner_s3.GLEANERIO_MINIO_PORT), + bucket, + source_name, + options=gleaner_s3.MinioOptions()) + summarydf = rg.summarize() + nt, g = summaryDF2ttl(summarydf, source_name) # let's try the new generator + summaryttl = g.serialize(format='longturtle') + # Lets always write out file to s3, and insert as a separate process + # we might be able to make this an asset..., but would need to be acessible by http + # if not stored in s3 + 
objectname = f"{SUMMARY_PATH}/{source_name}_release.ttl" # needs to match that is expected by post + s3ObjectInfo = S3ObjectInfo() + s3ObjectInfo.bucket_name = bucket + s3ObjectInfo.object_name = objectname + + bucket_name, object_name =s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo) + context.add_output_metadata( + metadata={ + "source": source, # Metadata can be any key-value pair + "run": "release_summarize", + "bucket_name": bucket_name, # Metadata can be any key-value pair + "object_name": object_name, + # The `MetadataValue` class has useful static methods to build Metadata + } + ) + # inserted = sumnsgraph.insert(bytes(summaryttl, 'utf-8'), content_type="application/x-turtle") + # if not inserted: + # raise Exception("Loading to graph failed.") + except Exception as e: + # use dagster logger + get_dagster_logger().error(f"Summary. Issue creating graph {str(e)} ") + raise Exception(f"Loading Summary graph failed. {str(e)}") + return 1 + + return + +@asset(group_name="load",key_prefix="ingest", + deps=[gleanerio_run], +op_tags={"ingest": "report"}, + partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"} + # , backfill_policy=BackfillPolicy.single_run() + ) +def identifier_stats(context): + gleaner_resource = context.resources.gleanerio + s3_resource = context.resources.gleanerio.gs3.s3 + gleaner_s3 = context.resources.gleanerio.gs3 + triplestore =context.resources.gleanerio.triplestore + source_name = context.asset_partition_key_for_output() + # source = getSitemapSourcesFromGleaner(gleaner_resource.GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name) + source = getSource(context, source_name) + source_url = source.get('url') + s3Minio = utils_s3.MinioDatastore(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS, + gleaner_s3.GLEANERIO_MINIO_PORT), + gleaner_s3.MinioOptions() + ) + bucket = gleaner_s3.GLEANERIO_MINIO_BUCKET + + + returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) + r = str('returned 
value:{}'.format(returned_value)) + #r = str('identifier stats returned value:{}'.format(returned_value)) + report = returned_value.to_json() + s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) + get_dagster_logger().info(f"identifer stats report returned {r} ") + return + +@asset(group_name="load",key_prefix="ingest", + deps=[gleanerio_run], +op_tags={"ingest": "report"}, + partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"} + # , backfill_policy=BackfillPolicy.single_run() + ) +def bucket_urls(context): + gleaner_resource = context.resources.gleanerio + s3_resource = context.resources.gleanerio.gs3.s3 + gleaner_s3 = context.resources.gleanerio.gs3 + triplestore =context.resources.gleanerio.triplestore + source_name = context.asset_partition_key_for_output() + # source = getSitemapSourcesFromGleaner(gleaner_resource.GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name) + source = getSource(context, source_name) + source_url = source.get('url') + s3Minio = utils_s3.MinioDatastore(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS, + gleaner_s3.GLEANERIO_MINIO_PORT), + gleaner_s3.MinioOptions() + ) + bucket = gleaner_s3.GLEANERIO_MINIO_BUCKET + + + res = s3Minio.listSummonedUrls(bucket, source_name) + r = str('returned value:{}'.format(res)) + bucketurls = pd.DataFrame(res).to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC) + s3Minio.putReportFile(bucket, source_name, "bucketutil_urls.csv", bucketurls) + get_dagster_logger().info(f"bucker urls report returned {r} ") + return + +# original code. inlined. 
+# def _releaseUrl( source, path=RELEASE_PATH, extension="nq"): +# proto = "http" +# if GLEANER_MINIO_USE_SSL: +# proto = "https" +# address = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) +# bucket = GLEANER_MINIO_BUCKET +# release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}" +# return release_url +@asset(group_name="load",key_prefix="ingest", + deps=[release_nabu_run], +op_tags={"ingest": "report"}, + partitions_def=sources_partitions_def, required_resource_keys={"gleanerio"} + # , backfill_policy=BackfillPolicy.single_run() + ) +def graph_stats_report(context) : + gleaner_resource = context.resources.gleanerio + s3_resource = context.resources.gleanerio.gs3.s3 + gleaner_s3 = context.resources.gleanerio.gs3 + triplestore = context.resources.gleanerio.triplestore + source_name = context.asset_partition_key_for_output() + # source = getSitemapSourcesFromGleaner(gleaner_resource.GLEANERIO_GLEANER_CONFIG_PATH, sourcename=source_name) + source = getSource(context, source_name) + source_url = source.get('url') + s3Minio = utils_s3.MinioDatastore(PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS, + gleaner_s3.GLEANERIO_MINIO_PORT), + gleaner_s3.MinioOptions() + ) + bucket = gleaner_s3.GLEANERIO_MINIO_BUCKET + + #returned_value = generateGraphReportsRepo(source_name, graphendpoint, reportList=reportTypes["repo_detailed"]) + proto = "http" + if gleaner_s3.GLEANERIO_MINIO_USE_SSL: + proto = "https" + address = PythonMinioAddress(gleaner_s3.GLEANERIO_MINIO_ADDRESS, gleaner_s3.GLEANERIO_MINIO_PORT) + + s3FileUrl = f"{proto}://{address}/{bucket}/{RELEASE_PATH}/{source_name}_release.nq" + #s3FileUrl = _releaseUrl(source_name ) + get_dagster_logger().info(f"get release for {source_name} from returned {s3FileUrl} ") + returned_value = generateGraphReportsRelease(source_name,s3FileUrl) + r = str('returned value:{}'.format(returned_value)) + #report = json.dumps(returned_value, indent=2) # value already json.dumps + report = 
returned_value + s3Minio.putReportFile(bucket, source_name, "graph_stats.json", report) + get_dagster_logger().info(f"graph stats returned {r} ") + return + +#might need to use this https://docs.dagster.io/_apidocs/repositories#dagster.RepositoryDefinition.get_asset_value_loader +#@sensor(job=summon_asset_job) +# @sensor(asset_selection=AssetSelection.keys("gleanerio_orgs")) +# def sources_sensor(context ): +# sources = gleanerio_orgs +# new_sources = [ +# source +# for source in sources +# if not sources_partitions_def.has_partition_key( +# source, dynamic_partitions_store=context.instance +# ) +# ] +# +# return SensorResult( +# run_requests=[ +# RunRequest(partition_key=source) for source in new_sources +# ], +# dynamic_partitions_requests=[ +# sources_partitions_def.build_add_request(new_sources) +# ], +# ) + +# need to add a sensor to add paritions when one is added +# https://docs.dagster.io/concepts/partitions-schedules-sensors/partitioning-assets#dynamically-partitioned-assets + + +# ######### +# CRUFT +# worked to see if this could be a graph with an assent, and really a defiend asset job works better + +# ## partitioning +# #### +# class HarvestOpConfig(Config): +# source_name: str +# @dynamic_partitioned_config(partition_fn=gleanerio_orgs) +# def harvest_config(partition_key: str): +# return { +# "ops": +# {"harvest_and_release": +# {"config": {"source_name": partition_key}, +# "ops": { +# "gleanerio_run": +# {"config": {"source_name": partition_key} +# }, +# "nabu_release_run": +# {"config": {"source_name": partition_key} +# } +# } +# } +# } +# } +# +# # ops: +# # harvest_and_release: +# # ops: +# # gleanerio_run: +# # config: +# # source_name: "" +# # nabu_release_run: +# # config: +# # source_name: "" +# +# @graph_asset(partitions_def=sources_partitions_def) +# #@graph_asset( ) +# def harvest_and_release() : +# #source = context.asset_partition_key_for_output() +# #containers = getImage() +# #harvest = gleanerio_run(start=containers) +# harvest = 
gleanerio_run() +# release = nabu_release_run(harvest) +# return release +# +# #@asset +# # def harvest_op(context, config: HarvestOpConfig): +# # context.log.info(config.source_name) +# # harvest = gleanerio_run() +# # release = nabu_release_run(harvest) +# # return release +# +# # @job(config=harvest_config) +# # def harvest_job( ): +# # harvest_op() +# #harvest_and_release() +# # @schedule(cron_schedule="0 0 * * *", job=harvest_job) +# # def geocodes_schedule(): +# # return RunRequest(partition_key="iris") diff --git a/dagster/implnets/workflows/ingest/ingest/assets/tenant.py b/dagster/implnets/workflows/ingest/ingest/assets/tenant.py new file mode 100644 index 00000000..5259c52a --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/assets/tenant.py @@ -0,0 +1,185 @@ +from dagster import ( + op, job, Config,asset, + In, Nothing, + sensor, RunRequest, RunConfig, + SensorEvaluationContext, asset_sensor, EventLogEntry, + SkipReason, + AssetKey, + static_partitioned_config, dynamic_partitioned_config, DynamicPartitionsDefinition, + define_asset_job, AssetSelection,graph_asset, + get_dagster_logger +) + +from dagster_aws.s3.sensor import get_s3_keys +from typing import List, Dict +from pydantic import Field +import pydash + +#from pydash.collections import find +#from pydash.predicates import is_match +from ec.graph.manageGraph import ManageBlazegraph +from ..assets import gleanerio_tenants, tenant_partitions_def, sources_partitions_def +from .gleaner_summon_assets import RELEASE_PATH, SUMMARY_PATH + +class TenantConfig(Config): + source_name: str + name: str + source_list: List[str] + TENANT_GRAPH_NAMESPACE: str + TENANT_GRAPH_SUMMARY_NAMESPACE: str + SUMMARY_PATH: str = Field( + description="GLEANERIO_GRAPH_SUMMARY_PATH.", default='graphs/summary') + RELEASE_PATH : str = Field( + description="GLEANERIO_GRAPH_RELEASE_PATH.", default='graphs/latest') + + +class TenantOpConfig(Config): + source_name: str + +def find_tenants_with_source(context, source_name, 
tenats_all): + get_dagster_logger().info(f" find tenant {source_name} with {tenats_all}") + tenants =[] + # tenants = pydash.collections.find(tenats_all, + # lambda t: p ydash.predicates.is_match(t["sources"], source_name) or pydash.predicates.is_match(t["sources"], 'all') + # ) + #tenants = pydash.collections.find(tenats_all, lambda t: pydash.predicates.is_match(t["sources"], "all") ) + for tenant in tenats_all: + get_dagster_logger().info(f" {tenant['community']} sources {tenant['sources']}") + if source_name in tenant["sources"]: + get_dagster_logger().info(f" found source {source_name} in {tenant['community']}") + tenants.append(tenant) + if 'all' in tenant["sources"]: + get_dagster_logger().info(f" found source all in {tenant['community']}") + tenants.append(tenant) + context.log.info(f" source {source_name} in {tenants}") + return tenants +@asset( + group_name="tenant_load",key_prefix="ingest", +op_tags={"ingest": "graph"}, + deps=[AssetKey(["ingest","tenant_names"]), AssetKey(["ingest","tenant_all"])], + required_resource_keys={"gleanerio",} + ,partitions_def=sources_partitions_def +) +#def upload_release(context, config:TennantOpConfig ): +def upload_release(context ): + #context.log.info(config.source_name) + tenants_all = context.repository_def.load_asset_value(AssetKey(["ingest","tenant_all"]))['tenant'] + source_name = context.asset_partition_key_for_output() + + context.log.info(f"source_name {source_name}") + gleaner_resource = context.resources.gleanerio + s3_resource = context.resources.gleanerio.gs3.s3 + gleaner_s3 = context.resources.gleanerio.gs3 + triplestore = context.resources.gleanerio.triplestore + tenants = find_tenants_with_source(context, source_name, tenants_all) + for tenant in tenants: + #tenant["graph"]['main_namespace'] + #bg = ManageBlazegraph(triplestore.GLEANERIO_GRAPH_URL, tenant["graph"]['main_namespace']) + try: + #bg.upload_nq_file() + namespace = tenant['graph']['main_namespace'] + endpoint = 
triplestore.GraphEndpoint(namespace) + triplestore.post_to_graph(source_name, path=RELEASE_PATH, extension="nq", graphendpoint=endpoint) + context.log.info(f"load release for {source_name} to tenant {tenant['community']} {endpoint} ") + except Exception as ex: + context.log.info(f"load to tenant {source_name} failed to {endpoint} {ex}") + raise Exception(f"load to tenant {source_name} failed to {endpoint} {ex}") + return + +#@asset(required_resource_keys={"gleanerio",},ins={"start": In(Nothing)}) +@asset(group_name="tenant_load",key_prefix="ingest", +op_tags={"ingest": "graph"}, + deps=[AssetKey(["ingest","tenant_names"]), AssetKey(["ingest","tenant_all"])], + required_resource_keys={"gleanerio",} + ,partitions_def=sources_partitions_def + ) +#def upload_summary(context, config:TennantOpConfig): +def upload_summary(context): + #context.log.info(config.source_name) + source_name = context.asset_partition_key_for_output() + context.log.info(f"tennant_name {source_name} ") + tenants_all = context.repository_def.load_asset_value(AssetKey(["ingest","tenant_all"]))['tenant'] + + gleaner_resource = context.resources.gleanerio + s3_resource = context.resources.gleanerio.gs3.s3 + gleaner_s3 = context.resources.gleanerio.gs3 + triplestore = context.resources.gleanerio.triplestore + tenants = find_tenants_with_source(context,source_name, tenants_all) + for tenant in tenants: + # bg = ManageBlazegraph(triplestore.GLEANERIO_GRAPH_URL, tenant["graph"]['summary_namespace']) + try: + # bg.upload_nq_file() + namespace = tenant['graph']['summary_namespace'] + endpoint = triplestore.GraphEndpoint(namespace) + triplestore.post_to_graph(source_name, path=SUMMARY_PATH,extension="ttl", graphendpoint=endpoint) + context.log.info(f"load summary for {source_name} to tenant {tenant['community']} {endpoint}") + except Exception as ex: + context.log.error(f"load to tenant failed {source_name} {endpoint} {ex}") + raise Exception(f"load to tenant failed {source_name} {endpoint} {ex}") + return 
+# +# @asset(group_name="tenant_create",required_resource_keys={"gleanerio",},partitions_def=tenant_partitions_def) +# def create_graph_namespaces(context): +# #context.log.info(config.source_name) +# source_name = context.asset_partition_key_for_output() +# context.log.info(f"tennant_name {source_name}") +# gleaner_resource = context.resources.gleanerio +# s3_resource = context.resources.gleanerio.gs3.s3 +# gleaner_s3 = context.resources.gleanerio.gs3 +# triplestore = context.resources.gleanerio.triplestore +# pass +@asset(group_name="tenant_create",key_prefix="ingest", + deps=[AssetKey(["ingest","tenant_all"])], +op_tags={"ingest": "graph"}, + required_resource_keys={"gleanerio",},partitions_def=tenant_partitions_def) +def create_graph_namespaces(context): + #context.log.info(config.source_name) + tenant_name = context.asset_partition_key_for_output() + context.log.info(f"tennant_name {tenant_name}") + tenants = context.repository_def.load_asset_value(AssetKey(["ingest","tenant_all"])) + # from https://stackoverflow.com/questions/2361426/get-the-first-item-from-an-iterable-that-matches-a-condition + tenant = next((t for t in tenants["tenant"] if t['community'] == tenant_name ),None) + if tenant is None: + raise Exception("Tenant with name {} does not exist".format(tenant_name)) + context.log.info(f"tennant {tenant}") + # should we put a default. 
+ main_namespace = tenant["graph"]["main_namespace"] + summary_namespace = tenant["graph"]["summary_namespace"] + gleaner_resource = context.resources.gleanerio + s3_resource = context.resources.gleanerio.gs3.s3 + gleaner_s3 = context.resources.gleanerio.gs3 + triplestore = context.resources.gleanerio.triplestore + bg = ManageBlazegraph(triplestore.GLEANERIO_GRAPH_URL, main_namespace ) + bg_summary = ManageBlazegraph(triplestore.GLEANERIO_GRAPH_URL, summary_namespace) + try: + msg = bg.createNamespace(quads=True) + context.log.info(f"graph creation {tenant_name} {triplestore.GLEANERIO_GRAPH_URL} {msg}") + msg = bg_summary.createNamespace(quads=False) + context.log.info(f"graph creation {tenant_name} {triplestore.GLEANERIO_GRAPH_URL} {msg}") + except Exception as ex : + context.log.error(f"graph creation failed {tenant_name} {triplestore.GLEANERIO_GRAPH_URL} {ex}") + raise Exception(f"graph creation failed {tenant_name} {triplestore.GLEANERIO_GRAPH_URL} {ex}") + return + +@asset(group_name="tenant_create",key_prefix="ingest", + deps=[AssetKey(["ingest","tenant_all"]), AssetKey(["ingest","create_graph_namespaces"])], + required_resource_keys={"gleanerio",},partitions_def=tenant_partitions_def) +def create_tenant_containers(context): + #context.log.info(config.source_name) + tenant_name = context.asset_partition_key_for_output() + tenants = context.repository_def.load_asset_value(AssetKey(["ingest","tenant_all"])) + context.log.info(f"tennant_name {tenant_name}") + gleaner_resource = context.resources.gleanerio + s3_resource = context.resources.gleanerio.gs3.s3 + gleaner_s3 = context.resources.gleanerio.gs3 + triplestore = context.resources.gleanerio.triplestore + pass +#@static_partitioned_config(partition_keys=TENNANT_NAMES) + + #return {"ops": {"continent_op": {"config": {"continent_name": partition_key}}}} +#@job(config=tennant_config, partitions_def=tenant_partitions_def) +# @job( partitions_def=tenant_partitions_def) +# def build_community(): +# source_name = 
context.asset_partition_key_for_output() +# context.log.info(f"tennant_name {source_name}") +# upload_summary(upload_release()) diff --git a/dagster/implnets/workflows/ingest/ingest/implnet_jobs_SOURCEVAL.py b/dagster/implnets/workflows/ingest/ingest/implnet_jobs_SOURCEVAL.py new file mode 100644 index 00000000..bd1e2c14 --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/implnet_jobs_SOURCEVAL.py @@ -0,0 +1,7 @@ +from dagster import job + +from ops.implnet_ops_SOURCEVAL import harvest_SOURCEVAL + +@job +def implnet_job_SOURCEVAL(): + harvest_SOURCEVAL() \ No newline at end of file diff --git a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_r2r.py b/dagster/implnets/workflows/ingest/ingest/implnet_ops_SOURCEVAL.py similarity index 82% rename from dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_r2r.py rename to dagster/implnets/workflows/ingest/ingest/implnet_ops_SOURCEVAL.py index ab34084f..156f27b8 100644 --- a/dagster/implnets/generatedCode/implnet-eco/output/ops/implnet_ops_r2r.py +++ b/dagster/implnets/workflows/ingest/ingest/implnet_ops_SOURCEVAL.py @@ -48,7 +48,7 @@ # env items URL = os.environ.get('PORTAINER_URL') APIKEY = os.environ.get('PORTAINER_KEY') - +CONTAINER_WAIT_TIMEOUT= os.environ.get('GLEANERIO_CONTAINER_WAIT_SECONDS', 5) GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) @@ -138,7 +138,7 @@ def s3reader(object): get_dagster_logger().info(f"S3 read error : {str(err)}") -def s3loader(data, name): +def s3loader(data, name, date_string=datetime.now().strftime("%Y_%m_%d_%H_%M_%S")): secure= GLEANER_MINIO_USE_SSL server = _pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT) @@ -158,8 +158,8 @@ def s3loader(data, name): # else: # print("Bucket 'X' already exists") - now = datetime.now() - date_string = now.strftime("%Y_%m_%d_%H_%M_%S") + # now = datetime.now() + # date_string = now.strftime("%Y_%m_%d_%H_%M_%S") 
logname = name + '_{}.log'.format(date_string) objPrefix = GLEANERIO_LOG_PREFIX + logname @@ -307,7 +307,7 @@ def gleanerio(context, mode, source): ## ------------ Create returnCode = 0 get_dagster_logger().info(f"Gleanerio mode: {str(mode)}") - + date_string = datetime.now().strftime("%Y_%m_%d_%H_%M_%S") if str(mode) == "gleaner": IMAGE =GLEANERIO_GLEANER_IMAGE @@ -424,13 +424,8 @@ def gleanerio(context, mode, source): data["Env"] = enva data["HostConfig"] = { "NetworkMode": GLEANER_HEADLESS_NETWORK, - # "Binds": [f"{GLEANER_CONFIG_VOLUME}:/configs"] } - # data["Volumes"] = [ - # "dagster-project:/configs" - # ] - # we would like this to be "dagster-${PROJECT:-eco}" but that is a bit tricky - # end setup of data + # docker dagster get_dagster_logger().info(f"start docker code region: ") @@ -468,102 +463,74 @@ def gleanerio(context, mode, source): cid = container.id # legacy til the start get's fixed - - ## ------------ Archive to load, which is how to send in the config (from where?) - - - -# this method of watching the logs, - # do not let a possible issue with container logs stop log upload. - ## I thinkthis happens when a container exits immediately. - try: - for line in container.logs(stdout=True, stderr=True, stream=True, follow=True): - get_dagster_logger().debug(line) # noqa: T201 - except docker.errors.APIError as ex: - - get_dagster_logger().info(f"This is ok. watch container logs failed Docker API ISSUE: {repr(ex)}") - except Exception as ex: - get_dagster_logger().info(f"This is ok. watch container logs failed other issue:{repr(ex)} ") - - - - - # ## ------------ Wait expect 200 - # we want to get the logs, no matter what, so do not exit, yet. - ## or should logs be moved into finally? - ### in which case they need to be methods that don't send back errors. 
- exit_status = container.wait()["StatusCode"] - get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") - # WE PULL THE LOGS, then will throw an error - returnCode = exit_status - - - - - ## ------------ Copy logs expect 200 - - - c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') - - # write to s3 - - s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object - #s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object - # write to minio (would need the minio info here) - - get_dagster_logger().info(f"container Logs to s3: ") - -## get log files - url = URL + 'containers/' + cid + '/archive' - params = { - 'path': f"{WorkingDir}/logs" - } - query_string = urllib.parse.urlencode(params) - url = url + "?" + query_string - - # print(url) - req = request.Request(url, method="GET") - req.add_header('X-API-Key', APIKEY) - req.add_header('content-type', 'application/x-compressed') - req.add_header('accept', 'application/json') - r = request.urlopen(req) - - log.info(f"{r.status} ") - get_dagster_logger().info(f"Container Archive Retrieved: {str(r.status)}") - # s3loader(r.read().decode('latin-1'), NAME) - s3loader(r.read(), f"{source}_{mode}_runlogs") - # Future, need to extraxct files, and upload +# Removed watching the logs, in favor of periodic upload + wait_count = 0 + while True: + wait_count += 1 + try: + container.wait(timeout=CONTAINER_WAIT_TIMEOUT) + exit_status = container.wait()["StatusCode"] + get_dagster_logger().info(f"Container Wait Exit status: {exit_status}") + # WE PULL THE LOGS, then will throw an error + returnCode = exit_status + c = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1') + + # write to s3 + + s3loader(str(c).encode(), NAME, date_string=date_string) # s3loader needs a bytes like object + # s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object + # write to minio (would need the minio info here) + + 
get_dagster_logger().info(f"container Logs to s3: ") +# this needs to be address at some point. https://www.appsloveworld.com/docker/100/85/docker-py-getarchive-destination-folder + path = f"{WorkingDir}/logs" + tar_archive_stream, tar_stat = container.get_archive(path) + archive = bytearray() + for chunk in tar_archive_stream: + archive.extend(chunk) + s3loader(archive, f"{source}_{mode}_runlogs", date_string=date_string) + get_dagster_logger().info(f"uploaded logs : {source}_{mode}_runlogs to {path}") + break + except requests.exceptions.ReadTimeout as ex: + path = f"{WorkingDir}/logs" + tar_archive_stream, tar_stat = container.get_archive(path) + archive = bytearray() + for chunk in tar_archive_stream: + archive.extend(chunk) + s3loader(archive, f"{source}_{mode}_runlogs", date_string=date_string) + get_dagster_logger().info(f"uploaded {wait_count}th log : {source}_{mode}_runlogs to {path}") + except docker.errors.APIError as ex: + get_dagster_logger().info(f"Container Wait docker API error : {str(ex)}") + returnCode = 1 + break + if container.status == 'exited' or container.status == 'removed': + get_dagster_logger().info(f"Container exited or removed. status: {container.status}") + exit_status = container.wait()["StatusCode"] + returnCode = exit_status + s3loader(str(c).encode(), NAME) # s3loader needs a bytes like object + # s3loader(str(c).encode('utf-8'), NAME) # s3loader needs a bytes like object + # write to minio (would need the minio info here) + + get_dagster_logger().info(f"container Logs to s3: ") + # this needs to be address at some point. 
https://www.appsloveworld.com/docker/100/85/docker-py-getarchive-destination-folder + path = f"{WorkingDir}/logs" + tar_archive_stream, tar_stat = container.get_archive(path) + archive = bytearray() + for chunk in tar_archive_stream: + archive.extend(chunk) + s3loader(archive, f"{source}_{mode}_runlogs", date_string=date_string) + get_dagster_logger().info(f"uploaded logs : {source}_{mode}_runlogs to {path}") + break + + # ABOVE Future, need to extraxct files, and upload # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8'))) # pw_tar.extractall("extract_to/") - # looks like get_archive also has issues. Returns nothing, - # strm, stat = container.get_archive(f"{WorkingDir}/logs/") - # get_dagster_logger().info(f"container Logs to s3: {str(stat)}") - # - # i =0 - # for d in strm: - # r = d.decode('utf-8') - # # s3loader(r.read().decode('latin-1'), NAME) - # s3loader(r.encode(), f"{source}_{i}_runlogs") - # i+=1 - - # s3loader(r.read().decode('latin-1'), NAME) if exit_status != 0: raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}") finally: if (not DEBUG) : - # if (cid): - # url = URL + 'containers/' + cid - # req = request.Request(url, method="DELETE") - # req.add_header('X-API-Key', APIKEY) - # # req.add_header('content-type', 'application/json') - # req.add_header('accept', 'application/json') - # r = request.urlopen(req) - # print(r.status) - # get_dagster_logger().info(f"Container Remove: {str(r.status)}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") if (service): service.remove() get_dagster_logger().info(f"Service Remove: {service.name}") @@ -572,14 +539,7 @@ def gleanerio(context, mode, source): else: get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED") - # if (container): - # container.remove(force=True) - # get_dagster_logger().info(f"Container Remove: {container.name}") - # else: - # get_dagster_logger().info(f"Container Not created, so not removed.") - # - # 
@op(ins={"start": In(Nothing)})
def SOURCEVAL_gleaner(context):
    # Runs the "gleaner" mode of gleanerio() for the SOURCEVAL source and logs
    # the code it returns. The "start" input is a Nothing dependency: it only
    # orders this op after its upstream op and carries no data.
    # The op itself produces no output value.
    exit_code = gleanerio(context, "gleaner", "SOURCEVAL")
    r = 'returned value:{}'.format(exit_code)
    get_dagster_logger().info(f"Gleaner returned {r} ")
@op(ins={"start": In(Nothing)})
def SOURCEVAL_uploadrelease(context):
    # Posts the SOURCEVAL release file (extension "nq") to the graph through
    # post_to_graph() and logs whatever that helper returns.
    # "start" is a Nothing input used purely for ordering; no output is produced.
    upload_result = post_to_graph("SOURCEVAL", extension="nq")
    r = 'returned value:{}'.format(upload_result)
    get_dagster_logger().info(f"upload release returned {r} ")
get_dagster_logger().info(f"missing graph report returned {r} ") return @op(ins={"start": In(Nothing)}) -def r2r_graph_reports(context) : - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="r2r") +def SOURCEVAL_graph_reports(context) : + source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="SOURCEVAL") #source_url = source.get('url') s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) bucket = GLEANER_MINIO_BUCKET - source_name = "r2r" + source_name = "SOURCEVAL" graphendpoint = _graphEndpoint() # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql" @@ -696,11 +656,11 @@ def r2r_graph_reports(context) : return @op(ins={"start": In(Nothing)}) -def r2r_identifier_stats(context): - source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="r2r") +def SOURCEVAL_identifier_stats(context): + source = getSitemapSourcesFromGleaner(DAGSTER_GLEANER_CONFIG_PATH, sourcename="SOURCEVAL") s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) bucket = GLEANER_MINIO_BUCKET - source_name = "r2r" + source_name = "SOURCEVAL" returned_value = generateIdentifierRepo(source_name, bucket, s3Minio) r = str('returned value:{}'.format(returned_value)) @@ -711,10 +671,10 @@ def r2r_identifier_stats(context): return @op(ins={"start": In(Nothing)}) -def r2r_bucket_urls(context): +def SOURCEVAL_bucket_urls(context): s3Minio = s3.MinioDatastore(_pythonMinioAddress(GLEANER_MINIO_ADDRESS, GLEANER_MINIO_PORT), MINIO_OPTIONS) bucket = GLEANER_MINIO_BUCKET - source_name = "r2r" + source_name = "SOURCEVAL" res = s3Minio.listSummonedUrls(bucket, source_name) r = str('returned value:{}'.format(res)) @@ -728,10 +688,10 @@ class S3ObjectInfo: object_name="" @op(ins={"start": In(Nothing)}) -def r2r_summarize(context) : +def SOURCEVAL_summarize(context) : s3Minio = 
@op(ins={"start": In(Nothing)})
def SOURCEVAL_upload_summarize(context):
    # Posts the SOURCEVAL summary file (extension "ttl", stored under
    # SUMMARY_PATH) to the summary graph endpoint and logs the helper's result.
    # "start" is a Nothing input used purely for ordering; no output is produced.
    upload_result = post_to_graph(
        "SOURCEVAL",
        path=SUMMARY_PATH,
        extension="ttl",
        graphendpoint=_graphSummaryEndpoint(),
    )
    r = 'returned value:{}'.format(upload_result)
    get_dagster_logger().info(f"upload summary returned {r} ")
# Schedule for implnet_job_SOURCEVAL, evaluated in the US/Central timezone.
# Each tick requests a run with an empty run config.
# NOTE(review): the hour field of the cron string is 24, which is outside the
# 0-23 range of standard cron syntax. SOURCEVAL looks like a template token, so
# this literal may be a placeholder that is replaced before the module is
# loaded -- confirm before relying on it as written.
@schedule(
    cron_schedule="0 24 * * *",
    job=implnet_job_SOURCEVAL,
    execution_timezone="US/Central",
)
def implnet_sch_SOURCEVAL(_context):
    return {}
a/dagster/implnets/workflows/ingest/ingest/jobs/__init__.py b/dagster/implnets/workflows/ingest/ingest/jobs/__init__.py new file mode 100644 index 00000000..f1a7cbeb --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/jobs/__init__.py @@ -0,0 +1,5 @@ +from .summon_assets import summon_asset_job, sources_asset_job + +from .tenant_load import tenant_asset_job, tenant_namespaces_job, release_asset_job + +from .summon_assets import sources_partitions_def diff --git a/dagster/implnets/workflows/ingest/ingest/jobs/summon_assets.py b/dagster/implnets/workflows/ingest/ingest/jobs/summon_assets.py new file mode 100644 index 00000000..6901701a --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/jobs/summon_assets.py @@ -0,0 +1,29 @@ +from dagster import ( + asset, Config, Output,AssetKey, + define_asset_job, AssetSelection, +get_dagster_logger, +) + +from ..assets.gleaner_summon_assets import * +from ..assets.gleaner_sources import sources_partitions_def, gleanerio_sources + +# disabling load_graph report until we can move it to tenant build runs. 
# Asset job "summon_and_release_job": materializes the listed summon/release/report
# assets for one partition of sources_partitions_def.
summon_asset_job = define_asset_job(
    name="summon_and_release_job",
    selection=AssetSelection.assets(validate_sitemap_url, gleanerio_run, release_nabu_run, load_report_s3,
                                    release_summarize, identifier_stats, bucket_urls,
                                    graph_stats_report,
                                    # load_report_graph is deliberately left out of the selection for now
                                    #load_report_graph
                                    ),
    partitions_def=sources_partitions_def,
    # The reserved "dagster/concurrency_key" tag is not used; runs are labelled
    # with a plain "ingest" tag instead.
    #tags={"dagster/concurrency_key": 'ingest'},
    tags={"ingest": 'docker'},
)
# The "ingest" tag allows run concurrency to be limited by tag value, see:
# https://docs.dagster.io/guides/limiting-concurrency-in-data-pipelines#limiting-opasset-concurrency-across-runs
# value is ingest

# Asset job "sources_config_updated_job": materializes the
# ["ingest", "sources_names_active"] asset together with its required
# multi-asset neighbors. Runs are tagged with dagster/priority "11".
sources_asset_job = define_asset_job(
    name="sources_config_updated_job",
    selection=AssetSelection.assets(AssetKey(["ingest","sources_names_active"])).required_multi_asset_neighbors(),
    partitions_def=sources_partitions_def,
    tags={"dagster/priority": "11"}
)
@dynamic_partitioned_config(partition_fn=gleanerio_tenants)
def tenant_config(partition_key: str):
    # Build the run config for one tenant partition.
    #
    # partition_key: name of the partition; it is passed to both upload ops
    #                as their "source_name" config value.
    # Returns a run-config dict of the form
    #   {"ops": {<op name>: {"config": {...}}, ...}}
    #
    # The value under "ops" must be a mapping keyed by op name. Wrapping each
    # entry in its own braces would create a set of dicts, which fails with
    # "TypeError: unhashable type: 'dict'" as soon as the function is called.
    #
    # NOTE(review): only source_name is supplied here. TenantConfig declares
    # further fields without defaults (name, source_list,
    # TENANT_GRAPH_NAMESPACE, TENANT_GRAPH_SUMMARY_NAMESPACE); if the upload ops
    # take a TenantConfig, those values still have to be provided -- confirm.
    op_config = {"config": {"source_name": partition_key}}
    default_config = {
        "ops": {
            "upload_release": op_config,
            # separate dict per op so a consumer mutating one cannot affect the other
            "upload_summary": {"config": dict(op_config["config"])},
        }
    }
    return default_config
def MinioOptions(self):
    """Courtesy helper for the ec utilities.

    Returns a dict with the keys "secure", "access_key" and "secret_key",
    taken from the wrapped S3Resource (use_ssl, aws_access_key_id,
    aws_secret_access_key).
    """
    return {"secure": self.s3.use_ssl

            , "access_key": self.s3.aws_access_key_id
            , "secret_key": self.s3.aws_secret_access_key
            }

## https://docs.dagster.io/_apidocs/libraries/dagster-aws#s3
# fields from dagster_aws.s3.S3Resource
#   region_name
#   endpoint_url
#   use_ssl
#   aws_access_key_id
#   aws_secret_access_key
def listPath(self, path='orgs', recusrsive=True):
    """List the objects stored under a prefix of GLEANERIO_MINIO_BUCKET.

    Args:
        path: key prefix to list.
        recusrsive: accepted for interface compatibility; currently not
            forwarded to the S3 call.

    Returns:
        The "Contents" list of the list_objects response, or an empty list
        when the response has no "Contents" entry (nothing matched the
        prefix). Only a single list_objects call is made.
    """
    result = self.s3.get_client().list_objects(
        Bucket=self.GLEANERIO_MINIO_BUCKET,
        Prefix=path,
        # Recusrsive=recusrsive
    )
    # The response carries no "Contents" key when no object matches, so
    # indexing it directly would raise KeyError for an empty prefix.
    return result.get("Contents", [])

def getFile(self, path='test'):
    """Fetch one object from GLEANERIO_MINIO_BUCKET.

    Args:
        path: object key.

    Returns:
        The "Body" of the get_object response, or None when the request
        fails for any reason (the failure is logged, not raised).
    """
    try:
        result = self.s3.get_client().get_object(
            Bucket=self.GLEANERIO_MINIO_BUCKET,
            Key=path,
        )
        get_dagster_logger().info(
            f"file {result['Body']}" )
        return result["Body"]
    except Exception as ex:
        # best effort: report and fall through to an implicit None
        get_dagster_logger().info(f"file {path} not found in {self.GLEANERIO_MINIO_BUCKET} at {self.s3.endpoint_url} {ex}")

def getTennatFile(self, path=''):
    """Fetch the tenant configuration file.

    Args:
        path: object key; an empty string selects
            GLEANERIO_CONFIG_PATH + GLEANERIO_TENANT_FILENAME.

    Returns:
        Whatever getFile() returns for that key, or None if an exception
        escapes getFile().
    """
    if path == '':
        path= f"{self.GLEANERIO_CONFIG_PATH}{self.GLEANERIO_TENANT_FILENAME}"
    try:
        get_dagster_logger().info(f"tenant_path {path} ")
        return self.getFile( path=path)

    except Exception:
        get_dagster_logger().info(f"tenant {path} not found ")
    #endpoint_url =_pythonMinioAddress(GLEANER_MINIO_ADDRESS, port=GLEANER_MINIO_PORT)

# this will change to use just a sources.
def getSourcesFile(self, path=''):
    """Fetch the sources (gleaner config) file.

    Args:
        path: object key; an empty string selects
            GLEANERIO_CONFIG_PATH + GLEANERIO_SOURCES_FILENAME.

    Returns:
        Whatever getFile() returns for that key, or None if an exception
        escapes getFile().
    """
    if path == '':
        path= f"{self.GLEANERIO_CONFIG_PATH}{self.GLEANERIO_SOURCES_FILENAME}"
    try:
        get_dagster_logger().info(f"sources_path {path} ")
        return self.getFile( path=path)

    except Exception:
        get_dagster_logger().info(f"sources_path {path} not found ")
    #endpoint_url =_pythonMinioAddress(GLEANER_MINIO_ADDRESS, port=GLEANER_MINIO_PORT)
as the minio configuration + +# +# # Vars and Envs +# GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio") +# # env items +# URL = os.environ.get('PORTAINER_URL') +# APIKEY = os.environ.get('PORTAINER_KEY') +# CONTAINER_WAIT_TIMEOUT= os.environ.get('GLEANERIO_CONTAINER_WAIT_SECONDS', 5) +# +# Let's try to use dasgeter aws as the minio configuration +# GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) +# GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) +# GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) +# GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) +# GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) +# GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) +# +# # set for the earhtcube utiltiies +# MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL +# +# ,"access_key": GLEANER_MINIO_ACCESS_KEY +# ,"secret_key": GLEANER_MINIO_SECRET_KEY +# } +# +# GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) +# # using GLEANER, even though this is a nabu property... 
same prefix seems easier +# GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) +# GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) +# GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) +# GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) +# GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) +# GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) +# GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner +# GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) +# GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) +# GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) +# GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) +# GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) +# GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) +# #GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') +# GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" ) +# +# SUMMARY_PATH = 'graphs/summary' +# RELEASE_PATH = 'graphs/latest' + +# this will probably need to handle the client, and the +class GleanerioResource(ConfigurableResource): + + DEBUG_CONTAINER: bool + # docker/portainer API + GLEANERIO_DOCKER_URL: str = Field( + description="Docker Endpoint URL.") + GLEANERIO_PORTAINER_APIKEY: str = Field( + description="Portainer API Key.") + # Dokcerhub container images + 
GLEANERIO_GLEANER_IMAGE: str = Field( + description="GLEANERIO_GLEANER_IMAGE.") + GLEANERIO_NABU_IMAGE: str = Field( + description="GLEANERIO_NABU_IMAGE.") + + # docker swarm resources. Presently a network and config names + GLEANERIO_DOCKER_HEADLESS_NETWORK: str = Field( + description="GLEANERIO_HEADLESS_NETWORK.") + GLEANERIO_DOCKER_GLEANER_CONFIG: str = Field( + description="GLEANERIO_DOCKER_GLEANER_CONFIG.") + GLEANERIO_DOCKER_NABU_CONFIG: str = Field( + description="GLEANERIO_DOCKER_NABU_CONFIG.") + + GLEANERIO_HEADLESS_ENDPOINT:str = Field( + description="GLEANERIO_HEADLESS_NETWORK.", default="http://headless:9000/") + +# location where config file will be mounted in container + GLEANERIO_GLEANER_CONFIG_PATH: str = Field( + description="GLEANERIO_DOCKER_GLEANER_CONFIG_PATH.") + + GLEANERIO_NABU_CONFIG_PATH: str = Field( + description="GLEANERIO_DOCKER_NABU_CONFIG_PATH.") + +# Execution parameter. The logs from LOG_PREFIX will be uploaded to s3 every n seconds. + GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT: int = Field( + description="CONTAINER_WAIT_TIMEOUT.", default=600) + GLEANERIO_LOG_PREFIX: str = Field( + description="GLEANERIO_DOCKER_LOG_PREFIX.") + + GLEANERIO_DAGSTER_CONFIG_PATH: str = Field( + description="DAGSTER_GLEANERIO_CONFIG_PATH for Project.") + gs3: gleanerS3Resource # this will be a botocore.client.S3. + triplestore: GraphResource # should be a blazegraph... 
def _get_client(self, docker_container_context: DockerContainerContext):
    """Create a Docker client for GLEANERIO_DOCKER_URL.

    The Portainer API key is attached as an "X-API-Key" header, both through
    the client's general config and directly on the API session headers.
    If the container context carries registry credentials, the client logs
    in to that registry.

    Args:
        docker_container_context: context whose optional ``registry`` mapping
            supplies "url", "username" and "password".

    Returns:
        The configured docker.DockerClient.
    """
    headers = {'X-API-Key': self.GLEANERIO_PORTAINER_APIKEY}
    client = docker.DockerClient(base_url=self.GLEANERIO_DOCKER_URL, version="1.43")
    # client = docker.APIClient(base_url=URL, version="1.35")
    get_dagster_logger().info(f"create docker client")
    if (client.api._general_configs):
        client.api._general_configs["HttpHeaders"] = headers
    else:
        client.api._general_configs = {"HttpHeaders": headers}
    client.api.headers['X-API-Key'] = self.GLEANERIO_PORTAINER_APIKEY
    get_dagster_logger().info(f" docker version {client.version()}")
    if docker_container_context.registry:
        client.login(
            registry=docker_container_context.registry["url"],
            username=docker_container_context.registry["username"],
            password=docker_container_context.registry["password"],
        )
    return client

def _create_service(self,
                    op_context: OpExecutionContext,
                    client,
                    container_context: DockerContainerContext,
                    image: str,
                    entrypoint: Optional[Sequence[str]],
                    command: Optional[Sequence[str]],
                    name="",
                    workingdir="/",

                    ):
    """Create a one-shot swarm service and wait for its container to appear.

    The service runs as a "replicated-job" with one replica, never restarts,
    and has the Gleaner and Nabu docker configs mounted at
    GLEANERIO_GLEANER_CONFIG_PATH / GLEANERIO_NABU_CONFIG_PATH.

    Args:
        op_context: accepted for interface compatibility; not used here.
        client: docker client (see _get_client).
        container_context: supplies env vars and networks for the service.
        image: image to run.
        entrypoint: accepted for interface compatibility; not used here.
        command: passed to the service as its args.
        name: service name; also used to find the service's container.
        workingdir: working directory inside the container.

    Returns:
        Tuple ``(service, container)`` with the first container found for
        the service.

    Raises:
        Exception: if either docker config is missing, or if no container
            for the service shows up after polling once per second for more
            than 12 attempts.
    """
    env_vars = dict([parse_env_var(env_var) for env_var in container_context.env_vars])
    get_dagster_logger().info(f"create docker service for {name}")
    ## thoughts
    # return service, container, since there is one
    restart_policy = RestartPolicy(condition='none')
    # docker.py if replicated job, total completions = replicas
    # replicas =0 you do not get a container
    service_mode = ServiceMode("replicated-job", concurrency=1, replicas=1)
    get_dagster_logger().info(str(client.configs.list()))
    # gleanerid = client.configs.list(filters={"name":{"gleaner-eco": "true"}})
    gleanerconfig = client.configs.list(filters={"name": [self.GLEANERIO_DOCKER_GLEANER_CONFIG]})
    if gleanerconfig is not None and len(gleanerconfig ) >0:
        get_dagster_logger().info(f"docker config gleaner id {str(gleanerconfig[0].id)}")
    else:
        raise Exception(f"docker config '{self.GLEANERIO_DOCKER_GLEANER_CONFIG}' not found. Please add Gleaner/Nabu configuration files to docker.")
    nabuconfig = client.configs.list(filters={"name": [self.GLEANERIO_DOCKER_NABU_CONFIG]})
    if nabuconfig is not None and len(nabuconfig) >0 :
        get_dagster_logger().info(f"docker config nabu id {str(nabuconfig[0].id)}")
    else:
        raise Exception(f"docker config '{self.GLEANERIO_DOCKER_NABU_CONFIG}' not found. Please add Gleaner/Nabu configuration files to docker.")
    get_dagster_logger().info(f"create docker service for {name}")
    gleaner = ConfigReference(gleanerconfig[0].id, self.GLEANERIO_DOCKER_GLEANER_CONFIG, self.GLEANERIO_GLEANER_CONFIG_PATH)
    nabu = ConfigReference(nabuconfig[0].id, self.GLEANERIO_DOCKER_NABU_CONFIG, self.GLEANERIO_NABU_CONFIG_PATH)
    configs = [gleaner, nabu]
    # name = name if len(name) else _get_container_name(op_context.run_id, op_context.op.name, op_context.retry_number),
    service = client.services.create(
        image,
        args=command,
        env=env_vars,
        name=name,
        networks=container_context.networks if len(container_context.networks) else None,
        restart_policy=restart_policy,
        mode=service_mode,
        workdir=workingdir,
        configs=configs
    )
    wait_count = 0
    while True:
        time.sleep(1)
        wait_count += 1
        get_dagster_logger().debug(str(service.tasks()))

        # result is not used below; the call is kept as in the original flow
        container_task = service.tasks(filters={"service": name})

        containers = client.containers.list(all=True, filters={"label": f"com.docker.swarm.service.name={name}"})
        if len(containers) > 0:
            break
        if wait_count > 12:
            # must raise an exception object: raising a bare string is itself
            # a TypeError and would hide this message
            raise Exception(f"Container for service {name} not starting")

    get_dagster_logger().info(len(containers))
    return service, containers[0]

def getImage(self,context):
    """Pull the configured Gleaner and Nabu images.

    Args:
        context: op context; its run and run launcher are used to build the
            DockerContainerContext handed to _get_client().
    """
    run_container_context = DockerContainerContext.create_for_run(
        context.dagster_run,
        context.instance.run_launcher
        if isinstance(context.instance.run_launcher, DockerRunLauncher)
        else None,
    )
    get_dagster_logger().info(f"call docker _get_client: ")
    # the client factory on this resource is _get_client
    client = self._get_client(run_container_context)
    client.images.pull(self.GLEANERIO_GLEANER_IMAGE)
    client.images.pull(self.GLEANERIO_NABU_IMAGE)

def s3loader(self,data, name, date_string=None):
    """Upload a bytes-like log payload to the log prefix of the S3 bucket.

    The object key is ``GLEANERIO_LOG_PREFIX + name + "_" + date_string + ".log"``.

    Args:
        data: bytes-like object to store.
        name: base name of the log object.
        date_string: timestamp suffix for the key. When omitted, the current
            time is formatted as "%Y_%m_%d_%H_%M_%S" at call time (a default
            computed in the signature would be frozen when the class is
            defined).
    """
    if date_string is None:
        date_string = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    logname = name + '_{}.log'.format(date_string)
    objPrefix = self.GLEANERIO_LOG_PREFIX + logname
    f = io.BytesIO()
    # length = f.write(bytes(json_str, 'utf-8'))
    length = f.write(data)
    f.seek(0)
    self.gs3.s3.get_client().put_object(Bucket=self.gs3.GLEANERIO_MINIO_BUCKET,
                                        Key=objPrefix,
                                        Body=f,  # io.BytesIO(data),
                                        ContentLength=length,  # len(data),
                                        ContentType="text/plain"
                                        )
    get_dagster_logger().info(f"Log uploaded: {str(objPrefix)}")
+
+    def _upload_container_output(self, container, name, date_string):
+        """Upload the container's combined stdout/stderr through ``s3loader``."""
+        output = container.logs(stdout=True, stderr=True, stream=False, follow=False).decode('latin-1')
+        # write to s3; s3loader needs a bytes like object
+        self.s3loader(str(output).encode(), name, date_string=date_string)
+
+    def _upload_run_logs(self, container, workingdir, source, mode, date_string):
+        """Upload ``<workingdir>/logs`` from the container as one tar archive.
+
+        The archive stream from ``container.get_archive`` is concatenated in
+        memory and passed to ``s3loader`` as ``<source>_<mode>_runlogs``.
+
+        :return: the in-container path that was archived
+        """
+        # this needs to be address at some point. https://www.appsloveworld.com/docker/100/85/docker-py-getarchive-destination-folder
+        path = f"{workingdir}/logs"
+        tar_archive_stream, tar_stat = container.get_archive(path)
+        archive = bytearray()
+        for chunk in tar_archive_stream:
+            archive.extend(chunk)
+        # use minio_resource
+        self.s3loader(archive, f"{source}_{mode}_runlogs", date_string=date_string)
+        return path
+
+    def execute(self,context, mode, source):
+        """Run one Gleaner or Nabu step for ``source`` as a docker service.
+
+        ``mode`` picks the image and arguments: ``"gleaner"`` uses
+        GLEANERIO_GLEANER_IMAGE; ``"prune"``, ``"prov"``, ``"orgs"`` and
+        ``"release"`` use GLEANERIO_NABU_IMAGE. The container output and its
+        ``logs`` directory are uploaded through ``s3loader``. Unless
+        DEBUG_CONTAINER is set, the service is removed afterwards.
+
+        :param context: execution context; ``dagster_run`` and ``instance`` are read
+        :param mode: one of the mode names above (compared via ``str(mode)``)
+        :param source: source name, used in the service name and in prefixes
+        :return: 0 when the container exits with status 0; 1 immediately, without
+            starting anything, when ``mode`` is not recognised
+        :raises Exception: when the container exit status is non-zero or the
+            docker API fails while waiting
+        """
+        ## ------------ Create
+        returnCode = 0
+        get_dagster_logger().info(f"Gleanerio mode: {str(mode)}")
+        date_string = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+        mode_name = str(mode)
+        # nabu sub-command and prefix for each nabu mode
+        nabu_commands = {
+            "prune": ["prune", "--prefix", "summoned/" + source],
+            "prov": ["prefix", "--prefix", "prov/" + source],
+            "orgs": ["prefix", "--prefix", "orgs"],
+            "release": ["release", "--prefix", "summoned/" + source],
+        }
+        NAME = f"sch_{source}_{mode_name}"
+        if mode_name == "gleaner":
+            IMAGE = self.GLEANERIO_GLEANER_IMAGE
+            # ARGS = f"gleaner --cfg/gleaner/gleanerconfig.yaml -source {source} --rude"
+            ARGS = ["--cfg", self.GLEANERIO_GLEANER_CONFIG_PATH, "-source", source, "--rude"]
+            WorkingDir = "/gleaner/"
+            #Entrypoint = ["/gleaner/gleaner", "--cfg", "/gleaner/gleanerconfig.yaml", "-source", source, "--rude"]
+            # LOGFILE = 'log_gleaner.txt' # only used for local log file writing
+        elif mode_name in nabu_commands:
+            IMAGE = self.GLEANERIO_NABU_IMAGE
+            ARGS = ["--cfg", self.GLEANERIO_NABU_CONFIG_PATH] + nabu_commands[mode_name]
+            WorkingDir = "/nabu/"
+            # Entrypoint = "nabu" (not passed on; the image's own entrypoint is used)
+            # LOGFILE = 'log_nabu.txt' # only used for local log file writing
+        else:
+            returnCode = 1
+            return returnCode
+
+        # from docker0dagster
+        run_container_context = DockerContainerContext.create_for_run(
+            context.dagster_run,
+            context.instance.run_launcher
+            if isinstance(context.instance.run_launcher, DockerRunLauncher)
+            else None,
+        )
+        validate_docker_image(IMAGE)
+
+        # Bound before the try so that ``finally`` and the exit-status check
+        # never touch an unassigned name when an earlier step fails.
+        service = None
+        exit_status = None
+        try:
+            #### gleaner
+            # v.BindEnv("minio.address", "MINIO_ADDRESS")
+            # v.BindEnv("minio.port", "MINIO_PORT")
+            # v.BindEnv("minio.ssl", "MINIO_USE_SSL")
+            # v.BindEnv("minio.accesskey", "MINIO_ACCESS_KEY")
+            # v.BindEnv("minio.secretkey", "MINIO_SECRET_KEY")
+            # v.BindEnv("minio.bucket", "MINIO_BUCKET")
+            # // v.BindEnv("minio.region", "MINIO_REGION")
+            # v.BindEnv("sparql.endpoint", "SPARQL_ENDPOINT")
+            # v.BindEnv("sparql.authenticate", "SPARQL_AUTHENTICATE")
+            # v.BindEnv("sparql.username", "SPARQL_USERNAME")
+            # v.BindEnv("sparql.password", "SPARQL_PASSWORD")
+            # v.BindEnv("s3.domain", "S3_DOMAIN")
+            ### gleaner summoner config
+            # viperSubtree.BindEnv("headless", "GLEANER_HEADLESS_ENDPOINT")
+            # viperSubtree.BindEnv("threads", "GLEANER_THREADS")
+            # viperSubtree.BindEnv("mode", "GLEANER_MODE")
+
+            #### NABU config
+            # minioSubtress.BindEnv("address", "MINIO_ADDRESS")
+            # minioSubtress.BindEnv("port", "MINIO_PORT")
+            # minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
+            # minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
+            # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
+            # minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
+            # minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
+            # viperSubtree.BindEnv("endpoint", "SPARQL_ENDPOINT")
+            ###### nabu sparql config
+            # viperSubtree.BindEnv("endpointBulk", "SPARQL_ENDPOINTBULK")
+            # viperSubtree.BindEnv("endpointMethod", "SPARQL_ENDPOINTMETHOD")
+            # viperSubtree.BindEnv("contentType", "SPARQL_CONTENTTYPE")
+            # viperSubtree.BindEnv("authenticate", "SPARQL_AUTHENTICATE")
+            # viperSubtree.BindEnv("username", "SPARQL_USERNAME")
+            # viperSubtree.BindEnv("password", "SPARQL_PASSWORD")
+            ### NABU object
+            # viperSubtree.BindEnv("bucket", "MINIO_BUCKET")
+            # viperSubtree.BindEnv("domain", "S3_DOMAIN")
+            # add in env variables here"Env": ["FOO=bar","BAZ=quux"],
+
+            # TODO: Build SPARQL_ENDPOINT from GLEANER_GRAPH_URL, GLEANER_GRAPH_NAMESPACE
+            # The ssl flag and both credentials are read from self.gs3.s3; address,
+            # port and bucket are read from self.gs3 itself.
+            enva = [
+                f"MINIO_ADDRESS={self.gs3.GLEANERIO_MINIO_ADDRESS}",
+                f"MINIO_PORT={self.gs3.GLEANERIO_MINIO_PORT}",
+                f"MINIO_USE_SSL={self.gs3.s3.use_ssl}",
+                f"MINIO_SECRET_KEY={self.gs3.s3.aws_secret_access_key}",
+                f"MINIO_ACCESS_KEY={self.gs3.s3.aws_access_key_id}",
+                f"MINIO_BUCKET={self.gs3.GLEANERIO_MINIO_BUCKET}",
+                f"SPARQL_ENDPOINT={self.triplestore.GraphEndpoint(self.GLEANERIO_GRAPH_NAMESPACE)}",
+                f"GLEANER_HEADLESS_ENDPOINT={self.GLEANERIO_HEADLESS_ENDPOINT}",
+                f"GLEANERIO_DOCKER_HEADLESS_NETWORK={self.GLEANERIO_DOCKER_HEADLESS_NETWORK}",
+            ]
+
+            # docker dagster
+            get_dagster_logger().info(f"start docker code region: ")
+
+            # trying to get headers in:
+            # https://github.com/docker/docker-py/blob/84414e343e526cf93f285284dd2c2c40f703e4a9/docker/utils/decorators.py#L45
+            op_container_context = DockerContainerContext(
+                # registry=registry,
+                env_vars=enva,
+                networks=[self.GLEANERIO_DOCKER_HEADLESS_NETWORK],
+                container_kwargs={"working_dir": WorkingDir,
+                                  # "volumes": {
+                                  #     f"{GLEANER_CONFIG_VOLUME}":
+                                  #         {'bind': '/configs', 'mode': 'rw'}
+                                  # },
+                                  },
+            )
+            container_context = run_container_context.merge(op_container_context)
+            get_dagster_logger().info(f"call docker _get_client: ")
+            client = self._get_client(container_context)
+
+            get_dagster_logger().info(f"try docker _create_service: ")
+            service, container = self._create_service(
+                context, client, container_context, IMAGE, "", ARGS, name=NAME,
+                workingdir=WorkingDir
+            )
+
+            # Removed watching the logs, in favor of periodic upload
+            wait_count = 0
+            while True:
+                wait_count += 1
+                try:
+                    # A single wait: its result already carries the exit status, so
+                    # no second wait without a timeout is needed.
+                    exit_status = container.wait(timeout=self.GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT)["StatusCode"]
+                    get_dagster_logger().info(f"Container Wait Exit status: {exit_status}")
+                    # WE PULL THE LOGS, then will throw an error
+                    returnCode = exit_status
+                    self._upload_container_output(container, NAME, date_string)
+                    get_dagster_logger().info(f"container Logs to s3: ")
+                    path = self._upload_run_logs(container, WorkingDir, source, mode, date_string)
+                    get_dagster_logger().info(f"uploaded logs : {source}_{mode}_runlogs to {path}")
+                    break
+                except requests.exceptions.ReadTimeout as ex:
+                    # Still running: upload what exists so far and keep waiting.
+                    path = self._upload_run_logs(container, WorkingDir, source, mode, date_string)
+                    get_dagster_logger().info(f"uploaded {wait_count}th log : {source}_{mode}_runlogs to {path}")
+                except docker.errors.APIError as ex:
+                    get_dagster_logger().info(f"Container Wait docker API error : {str(ex)}")
+                    returnCode = 1
+                    if exit_status is None:
+                        # No exit status was obtained; record the failure so the
+                        # check after the loop has a value to test.
+                        exit_status = 1
+                    break
+                # Reached only after a wait timeout.
+                # NOTE(review): container.status may be a cached value in docker-py;
+                # confirm whether container.reload() is needed before this test.
+                if container.status == 'exited' or container.status == 'removed':
+                    get_dagster_logger().info(f"Container exited or removed. status: {container.status}")
+                    exit_status = container.wait()["StatusCode"]
+                    returnCode = exit_status
+                    # The output is fetched here; no earlier step in this path has
+                    # read it.
+                    self._upload_container_output(container, NAME, date_string)
+                    get_dagster_logger().info(f"container Logs to s3: ")
+                    path = self._upload_run_logs(container, WorkingDir, source, mode, date_string)
+                    get_dagster_logger().info(f"uploaded logs : {source}_{mode}_runlogs to {path}")
+                    break
+
+            # ABOVE Future, need to extraxct files, and upload
+            # pw_tar = tarfile.TarFile(fileobj=StringIO(d.decode('utf-8')))
+            # pw_tar.extractall("extract_to/")
+
+            if exit_status != 0:
+                raise Exception(f"Gleaner/Nabu container returned exit code {exit_status}")
+        finally:
+            if service is None:
+                get_dagster_logger().info(f"Service Not created, so not removed.")
+            elif not self.DEBUG_CONTAINER:
+                service.remove()
+                get_dagster_logger().info(f"Service Remove: {service.name}")
+            else:
+                get_dagster_logger().info(f"Service {service.name} NOT Removed : DEBUG ENABLED")
+
+        if (returnCode != 0):
+            get_dagster_logger().info(f"Gleaner/Nabu container non-zero exit code. See logs in S3")
+            raise Exception("Gleaner/Nabu container non-zero exit code. See logs in S3")
+        return returnCode
diff --git a/dagster/implnets/workflows/ingest/ingest/resources/graph.py b/dagster/implnets/workflows/ingest/ingest/resources/graph.py
new file mode 100644
index 00000000..e2636be3
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/resources/graph.py
@@ -0,0 +1,128 @@
+import os
+from typing import Any, Dict
+
+import pydash
+from dagster import ConfigurableResource, Config, EnvVar, get_dagster_logger
+
+#from dagster import Field
+from pydantic import Field
+import requests
+
+from .gleanerS3 import gleanerS3Resource
+#Let's try to use dasgeter aws as the minio configuration
+from ..utils import PythonMinioAddress
+
+# class AirtableConfig(Config):
+#     DAGSTER_GLEANER_CONFIG_PATH = os.environ.get('DAGSTER_GLEANER_CONFIG_PATH', "/scheduler/gleanerconfig.yaml")
+#
+#     # Vars and Envs
+#     GLEANER_HEADLESS_NETWORK=os.environ.get('GLEANERIO_HEADLESS_NETWORK', "headless_gleanerio")
+#     # env items
+#     URL = os.environ.get('PORTAINER_URL')
+#     APIKEY = os.environ.get('PORTAINER_KEY')
+#     CONTAINER_WAIT_TIMEOUT= os.environ.get('GLEANERIO_CONTAINER_WAIT_SECONDS', 5)
+#
+#     Let's try to use dasgeter aws as the minio configuration
+#     GLEANER_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS'))
+#     GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT'))
+#     GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL')))
+#     GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY'))
+#     GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY'))
+#     GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET'))
+#
+#     # set for the earhtcube utiltiies
+#     MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL
+#
+#                    ,"access_key": GLEANER_MINIO_ACCESS_KEY
+#                    ,"secret_key": GLEANER_MINIO_SECRET_KEY
+#                    }
+#
+#     GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222"))
+#     # using GLEANER, even though this is a nabu property... same prefix seems easier
+#     GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL'))
+#     GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE'))
+#     GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml"))
+#     GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml"))
+#     GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest'))
+#     GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest'))
+#     GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner
+#     GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz'))
+#     GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/'))
+#     GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz'))
+#     GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/'))
+#     GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
+#     GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
+#     #GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
+#     GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
+#
+#     SUMMARY_PATH = 'graphs/summary'
+#     RELEASE_PATH = 'graphs/latest'
+
+
+class GraphResource(ConfigurableResource):
+    GLEANERIO_GRAPH_URL: str = Field(
+         description="GLEANERIO_GRAPH_URL.")
+    GLEANERIO_GRAPH_NAMESPACE: str = Field(
+        description="GLEANERIO_GRAPH_NAMESPACE.")
+    gs3: gleanerS3Resource
+
+# need multiple namespaces. let's do this.
+    def GraphEndpoint(self, namespace=None):
+        """Return the SPARQL endpoint URL for ``namespace``.
+
+        :param namespace: graph namespace; when omitted, the resource's own
+            GLEANERIO_GRAPH_NAMESPACE is used
+        :return: ``<GLEANERIO_GRAPH_URL>/namespace/<namespace>/sparql``
+        """
+        if namespace is None:
+            namespace = self.GLEANERIO_GRAPH_NAMESPACE
+        return f"{self.GLEANERIO_GRAPH_URL}/namespace/{namespace}/sparql"
+
+
+    def post_to_graph(self, source, path='graphs/latest', extension="nq", graphendpoint=None):
+        """Ask the graph store to LOAD the release file of ``source`` from object storage.
+
+        A SPARQL update ``LOAD <release_url>`` is posted form-encoded to the
+        endpoint; ``release_url`` is
+        ``<proto>://<address>/<bucket>/<path>/<source>_release.<extension>``.
+
+        :param source: source name used in the release file name
+        :param path: object prefix of the release file
+        :param extension: release file extension
+        :param graphendpoint: endpoint URL to post to; when omitted, the endpoint
+            of this resource's GLEANERIO_GRAPH_NAMESPACE is used
+        :return: True when the endpoint answers with status 200
+        :raises Exception: on any other status code
+        """
+        if graphendpoint is None:
+            # GraphEndpoint needs a namespace; calling it without one raised
+            # TypeError whenever the caller omitted graphendpoint.
+            graphendpoint = self.GraphEndpoint(self.GLEANERIO_GRAPH_NAMESPACE)
+        # revision of EC utilities, will have a insertFromURL
+        #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )
+# this need to get file from s3.
+
+        proto = "https" if self.gs3.GLEANERIO_MINIO_USE_SSL else "http"
+        address = PythonMinioAddress(self.gs3.GLEANERIO_MINIO_ADDRESS, self.gs3.GLEANERIO_MINIO_PORT)
+        bucket = self.gs3.GLEANERIO_MINIO_BUCKET
+        release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
+        # BLAZEGRAPH SPECIFIC
+        # url = f"{_graphEndpoint()}?uri={release_url}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
+        # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
+        # r = requests.post(url)
+        # log.debug(f' status:{r.status_code}') # status:404
+        # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
+        # if r.status_code == 200:
+        #     # ''
+        #     if 'data modified="0"' in r.text:
+        #         get_dagster_logger().info(f'graph: no data inserted ')
+        #         raise Exception("No Data Added: " + r.text)
+        #     return True
+        # else:
+        #     get_dagster_logger().info(f'graph: error')
+        #     raise Exception(f' graph: insert failed: status:{r.status_code}')
+
+        ### GENERIC LOAD FROM
+        url = f"{graphendpoint}" # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
+        get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
+        loadfrom = {'update': f'LOAD <{release_url}>'}
+        headers = {
+            'Content-Type': 'application/x-www-form-urlencoded'
+        }
+        r = requests.post(url, headers=headers, data=loadfrom )
+        get_dagster_logger().debug(f' status:{r.status_code}') # status:404
+        get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
+        if r.status_code != 200:
+            get_dagster_logger().info(f'graph: error {str(r.text)}')
+            raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')
+        get_dagster_logger().info(f'graph load response: {str(r.text)} ')
+        # ''
+        if 'mutationCount=0' in r.text:
+            # Logged only; an empty load is deliberately not treated as an error.
+            get_dagster_logger().info(f'graph: no data inserted ')
+            #raise Exception("No Data Added: " + r.text)
+        return True
+
+class BlazegraphResource(GraphResource):
+    pass
+
diff --git a/dagster/implnets/workflows/ingest/ingest/sensors/__init__.py b/dagster/implnets/workflows/ingest/ingest/sensors/__init__.py
new file mode 100644
index 00000000..68ec81b4
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/sensors/__init__.py
@@ -0,0 +1,5 @@
+
+from .load_on_release_sensor import release_file_sensor, release_file_sensor_v2
+from .gleaner_summon import sources_sensor, sources_schedule
+from .tenant_sensor import tenant_names_sensor, tenant_names_sensor_v2
+from .s3_configs_sensor import sources_s3_sensor, tenant_s3_sensor
diff --git a/dagster/implnets/workflows/ingest/ingest/sensors/gleaner_summon.py b/dagster/implnets/workflows/ingest/ingest/sensors/gleaner_summon.py
new file mode 100644
index 00000000..27b49a87
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/sensors/gleaner_summon.py
@@ -0,0 +1,105 @@
+import os
+
+import dagster
+from dagster import (
+    SensorResult, RunRequest,
+    EventLogEntry, AssetKey, asset_sensor,
+    schedule,ScheduleDefinition,DefaultSensorStatus,DefaultScheduleStatus,
+get_dagster_logger
+)
+from ..assets import (
+    sources_partitions_def
+)
+from ..jobs.summon_assets import summon_asset_job
+
+
+# this monitors the asset. It will harvest a new source
+# the sources_schedule_sensor will add to the weekly schedule
+
+# note on removal of partitions https://github.com/dagster-io/dagster/issues/14026
+@asset_sensor(default_status=DefaultSensorStatus.RUNNING, asset_key=AssetKey(["ingest","sources_names_active"]), job=summon_asset_job
+    # , minimum_interval_seconds=600
+              )
+def sources_sensor(context, asset_event: EventLogEntry):
+    """Keep the sources partition set in step with the ``sources_names_active`` asset.
+
+    Loads the asset value, deletes partitions whose key is no longer in it,
+    and requests a run plus a partition-add for every source that has no
+    partition yet.
+
+    :param context: sensor evaluation context
+    :param asset_event: the event that triggered the sensor
+    :return: SensorResult with one RunRequest per new source (run key
+        ``<source>_load``) and one add request for all new sources
+    """
+    context.log.info(f"sources_sensor: start")
+    assert asset_event.dagster_event and asset_event.dagster_event.asset_key
+    context.log.info(f"asset_key {asset_event.dagster_event.asset_key}")
+# well this is a pain. but it works. Cannot just pass it like you do in ops
+    # otherwise it's just an AssetDefinition.
+    sources = context.repository_def.load_asset_value(AssetKey(["ingest","sources_names_active"]))
+    new_sources = [
+        source
+        for source in sources
+        if not sources_partitions_def.has_partition_key(
+            source, dynamic_partitions_store=context.instance
+        )
+    ]
+    removed_sources = [
+        source
+        for source in sources_partitions_def.get_partition_keys(dynamic_partitions_store=context.instance)
+        if source not in sources
+    ]
+    for s in removed_sources:
+        # Delete from the same partition set the keys were read from, instead of
+        # a separately typed name that has to be kept in sync by hand.
+        context.instance.delete_dynamic_partition(sources_partitions_def.name, s)
+    context.log.info(f"new sources {new_sources}")
+    context.log.info(f"Removed {removed_sources}")
+    return SensorResult(
+        run_requests=[
+            RunRequest(partition_key=source
+                       # , job_name=f"{source}_load"
+                       , run_key=f"{source}_load"
+                       ) for source in new_sources
+        ],
+        dynamic_partitions_requests=[
+            sources_partitions_def.build_add_request(new_sources)
+        ],
+    )
+#
+# https://docs.dagster.io/concepts/partitions-schedules-sensors/schedules#static-partitioned-jobs
+# humm https://github.com/dagster-io/dagster/blob/567cb59f1da819bbb8522108fc2c2a3bace6c7b3/python_modules/dagster-test/dagster_test/toys/schedules.py#L41
+
+# # so this needs to be a schedule, and we handle the cron by ourselves.)
+# Cron expression and timezone for the harvest schedule; both can be overridden
+# through the environment.
+sched = os.environ.get("GLEANERIO_DEFAULT_SCHEDULE", "@weekly")
+sched_timezone = os.environ.get("GLEANERIO_DEFAULT_SCHEDULE_TIMEZONE", "America/Los_Angeles")
+
+
+@schedule(
+    job=summon_asset_job,
+    cron_schedule=sched,
+    execution_timezone=sched_timezone,
+    default_status=DefaultScheduleStatus.RUNNING,
+)
+def sources_schedule(context):
+    """Request one ``summon_asset_job`` run for every current sources partition.
+
+    :param context: schedule evaluation context; its instance is the
+        dynamic-partitions store that is queried
+    :return: list of RunRequest, one per partition key, each with run key
+        ``summon_asset_<partition_key>``
+    """
+    keys = sources_partitions_def.get_partition_keys(dynamic_partitions_store=context.instance)
+    get_dagster_logger().info(str(keys))
+    requests_out = []
+    for key in keys:
+        # run_key=f"{context.scheduled_execution_time}_{partition_key}"
+        requests_out.append(RunRequest(partition_key=key, run_key=f"summon_asset_{key}"))
+    return requests_out
+
+
+
+# from dagster import sensor, RunRequest, SensorExecutionContext
+# from dagster import (DynamicPartitionsDefinition, job)
+# # Define your dynamic partitions
+# fruits = DynamicPartitionsDefinition(name="fruits")
+# # Define a job that will process the partitions
+# @job()
+# def my_job():
+#     # Your job logic here
+#     pass
+# # Define a sensor that triggers the job and updates the partitions
+# @sensor(job=my_job)
+# def my_sensor(context: SensorExecutionContext):
+#     # Logic to determine if there are new partitions to add
+#     # For example, check a directory for new files, query a database, etc.
+# new_partitions = ["apple", "banana"] +# # Replace with your dynamic logic +# # Build add requests for the new partitions +# dynamic_partitions_requests = [fruits.build_add_request(new_partitions)] +# # Create a run request for each new partition +# run_requests = [RunRequest(partition_key=partition) for partition in new_partitions] +# # Return the sensor result with run requests and dynamic partition requests +# return SensorResult( +# run_requests=run_requests, +# dynamic_partitions_requests=dynamic_partitions_requests +# ) diff --git a/dagster/implnets/workflows/ingest/ingest/sensors/harvest_sched.py b/dagster/implnets/workflows/ingest/ingest/sensors/harvest_sched.py new file mode 100644 index 00000000..e69de29b diff --git a/dagster/implnets/workflows/ingest/ingest/sensors/load_on_release_sensor.py b/dagster/implnets/workflows/ingest/ingest/sensors/load_on_release_sensor.py new file mode 100644 index 00000000..17ab3b5a --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/sensors/load_on_release_sensor.py @@ -0,0 +1,129 @@ +from dagster import ( +op, job, Config,get_dagster_logger,DefaultSensorStatus, +sensor, RunRequest, RunConfig, +SensorEvaluationContext, +SkipReason, +AssetKey, +static_partitioned_config, +asset_sensor, multi_asset_sensor, +EventLogEntry +) +from dagster_aws.s3.sensor import get_s3_keys +from typing import List, Dict +from pydantic import Field + +from ..resources.gleanerio import GleanerioResource +from ..resources.gleanerS3 import gleanerS3Resource +from ..resources.graph import BlazegraphResource +from ..assets import tenant_partitions_def,TenantConfig +from ..jobs.tenant_load import release_asset_job, create_graph_namespaces +from ..assets.gleaner_summon_assets import RELEASE_PATH, SUMMARY_PATH + +#from ..jobs.tennant_load import build_community +# This sensor needs to detect when an source has completed its' run +# and then load the data into the client's graphstore. 
+
+
+
+# #######
+# Put the config for a tenant at the job level so we only have to define it once
+######
+
+
+
+
+
+#@sensor(job=build_community,minimum_interval_seconds=60)
+
+# https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors#using-resources-in-sensors
+# sensor factor example
+# https://github.com/dagster-io/dagster/blob/master/examples/project_fully_featured/project_fully_featured/sensors/hn_tables_updated_sensor.py
+######
+# https://docs.dagster.io/concepts/partitions-schedules-sensors/asset-sensors#when-all-partitions-have-new-materializations
+########
+
+# @asset_sensor(asset_key=AssetKey(["ingest","release_summarize"]),
+#               default_status=DefaultSensorStatus.RUNNING,
+#               job=release_asset_job, required_resource_keys={"gleanerio"},
+#               # minimum_interval_seconds=3600
+#               )
+@multi_asset_sensor(
+    monitored_assets=[
+        AssetKey(["ingest","release_summarize"])
+    ],
+    job=release_asset_job,
+    required_resource_keys={"gleanerio"}
+)
+def release_file_sensor_v2(context
+                           #,asset_event: EventLogEntry
+                           ):
+    """Request a ``release_asset_job`` run per partition of ``release_summarize``.
+
+    For every partition whose latest materialization records cover all
+    monitored asset keys, a RunRequest is added and the cursor is advanced
+    past those records.
+
+    :param context: multi-asset sensor evaluation context
+    :return: list of RunRequest (possibly empty); run keys are
+        ``<partition>_upload_release_<cursor at evaluation start>``
+    """
+    # assert asset_event.dagster_event and asset_event.dagster_event.asset_key
+
+    run_requests = []
+    # source_name = asset_event.dagster_event.partition
+    # source_key= asset_event.dagster_event.asset_key
+    # context.log.info(f"partition_key: {source_name} source_key: {source_key}")
+
+    since_key = context.cursor or None
+    context.log.info(f"sinceKey: {since_key}")
+
+    # run_requests = [RunRequest(
+    #     partition_key=source_name,
+    #     run_key=f"{source_name}_upload_release_{since_key}",
+    #     run_config={})]
+    # #context.update_cursor(since_key+1)
+    # context.update_cursor(since_key)
+    # context.log.info(f"sinceKey new: {context.cursor}")
+    # return run_requests
+    for (
+        partition,
+        materializations_by_asset,
+    ) in context.latest_materialization_records_by_partition_and_asset().items():
+        if set(materializations_by_asset.keys()) == set(context.asset_keys):
+            run_requests.append(RunRequest(partition_key=partition,
+                                           run_key=f"{partition}_upload_release_{since_key}",)
+                                )
+            for asset_key, materialization in materializations_by_asset.items():
+                context.advance_cursor({asset_key: materialization})
+    return run_requests
+@asset_sensor(asset_key=AssetKey(["ingest","release_summarize"]),
+              # default_status=DefaultSensorStatus.RUNNING,
+              job=release_asset_job, required_resource_keys={"gleanerio"},
+              # minimum_interval_seconds=3600
+              )
+def release_file_sensor(context,config: TenantConfig
+                        ):
+    """Request one run per object listed under SUMMARY_PATH after the cursor.
+
+    The cursor holds the last object key already handled; listing resumes
+    after it via ``StartAfter``.
+
+    :param context: sensor evaluation context providing the ``gleanerio`` resource
+    :param config: not read by this function
+        NOTE(review): confirm that dagster supplies a value for this parameter
+        on an asset sensor.
+    :return: SkipReason when nothing is listed, otherwise a list of RunRequest
+        whose run keys are the object keys
+    """
+    s3_resource = context.resources.gleanerio.gs3.s3
+    gleaner_s3 = context.resources.gleanerio.gs3
+    since_key = context.cursor or None
+    context.log.info(f"sinceKey: {since_key}")
+    #new_s3_keys = get_s3_keys(gleaner_s3.GLEANERIO_MINIO_BUCKET, prefix=SUMMARY_PATH, since_key=since_key)
+    list_args = {"Bucket": gleaner_s3.GLEANERIO_MINIO_BUCKET, "Prefix": SUMMARY_PATH}
+    if since_key is not None:
+        list_args["StartAfter"] = since_key
+    listing = s3_resource.get_client().list_objects_v2(**list_args)
+    # list_objects_v2 answers with a dict; the objects are under "Contents",
+    # which is absent when nothing matches. Iterating the dict itself yields
+    # the response field names, not object keys.
+    new_s3_keys = [entry["Key"] for entry in listing.get("Contents", [])]
+    context.log.info(f"keys: {new_s3_keys}")
+    if not new_s3_keys:
+        return SkipReason(f"No new s3 files found for bucket {gleaner_s3.GLEANERIO_MINIO_BUCKET}.")
+    context.log.info(f"new key len: {len(new_s3_keys)}")
+    last_key = new_s3_keys[-1]
+
+    run_requests = [RunRequest(run_key=s3_key, run_config={}) for s3_key in new_s3_keys]
+    context.update_cursor(last_key)
+    #context.update_cursor()
+    context.log.info(f"new sinceKey: {context.cursor}")
+    return run_requests
diff --git a/dagster/implnets/workflows/ingest/ingest/sensors/s3_configs_sensor.py
b/dagster/implnets/workflows/ingest/ingest/sensors/s3_configs_sensor.py new file mode 100644 index 00000000..3ad2d07d --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/sensors/s3_configs_sensor.py @@ -0,0 +1,134 @@ +from dagster import ( +op, job, Config,get_dagster_logger, +sensor, RunRequest, RunConfig, +SensorEvaluationContext,asset_sensor, EventLogEntry, +SkipReason, +AssetKey, +static_partitioned_config, +DefaultSensorStatus +) +from dagster_aws.s3.sensor import get_s3_keys +from typing import List, Dict +from pydantic import Field + +from ..resources.gleanerio import GleanerioResource +from ..resources.gleanerS3 import gleanerS3Resource +from ..resources.graph import BlazegraphResource +from ..assets import tenant_partitions_def,TenantConfig +from ..jobs.tenant_load import (release_asset_job, create_graph_namespaces, tenant_asset_job) +from ..jobs.summon_assets import sources_asset_job +from ..assets.gleaner_summon_assets import RELEASE_PATH, SUMMARY_PATH + +#from ..jobs.tennant_load import build_community +# This sensor needs to detect when an source has completed its' run +# and then load the data into the client's graphstore. 
+
+
+
+# #######
+# Put the config for a tenant at the job level so we only have to define it once
+######
+
+
+
+
+
+#@sensor(job=build_community,minimum_interval_seconds=60)
+
+# https://docs.dagster.io/concepts/partitions-schedules-sensors/sensors#using-resources-in-sensors
+# sensor factor example
+# https://github.com/dagster-io/dagster/blob/master/examples/project_fully_featured/project_fully_featured/sensors/hn_tables_updated_sensor.py
+######
+# https://docs.dagster.io/concepts/partitions-schedules-sensors/asset-sensors#when-all-partitions-have-new-materializations
+########
+def _config_file_run_requests(context, filename):
+    """Shared body of the two config-file sensors.
+
+    Reads the object's metadata with ``head_object`` and compares
+    ``str(LastModified)`` with the sensor cursor.
+
+    :param context: sensor evaluation context providing the ``gleanerio`` resource
+    :param filename: object key of the config file in GLEANERIO_MINIO_BUCKET
+    :return: SkipReason when the object does not exist; otherwise a list with
+        one RunRequest (run key = the timestamp string) when there is no cursor
+        or the cursor compares lower, else an empty list
+    """
+    gleaner_s3 = context.resources.gleanerio.gs3
+    s3_client = gleaner_s3.s3.get_client()
+    since_key = context.cursor or None
+    get_dagster_logger().info(f"sinceKey: {since_key}")
+    skip = SkipReason(f"No new s3 files found for bucket {gleaner_s3.GLEANERIO_MINIO_BUCKET}. {filename}")
+
+    try:
+        new_s3_keys = s3_client.head_object(
+            Bucket=gleaner_s3.GLEANERIO_MINIO_BUCKET,
+            Key=filename,
+        )
+    except s3_client.exceptions.ClientError as err:
+        # head_object reports a missing object by raising, not by returning an
+        # empty value; treat only "not found" as a skip and let the rest surface.
+        # presumably get_client() returns a boto3 client - verify.
+        if err.response.get("Error", {}).get("Code") in ("404", "NoSuchKey", "NotFound"):
+            return skip
+        raise
+
+    # new_s3_keys = s3_resource.resource.ObjectSummary(
+    #     Bucket=gleaner_s3.GLEANERIO_MINIO_BUCKET,
+    #     Key=filename,
+    #
+    # )
+
+    # since_key = context.cursor or None
+    # new_s3_keys = get_s3_keys("my_s3_bucket", since_key=since_key)
+
+    if not new_s3_keys:
+        return skip
+    get_dagster_logger().info(f"metadata {new_s3_keys}")
+    #new_s3_keys = list(new_s3_keys)
+    last_key = str(new_s3_keys['LastModified'])
+    get_dagster_logger().info(f"last_modified: {last_key}")
+    run_requests =[]
+    # String comparison of the two timestamps, as stored in the cursor.
+    if since_key is None or since_key < last_key:
+        #run_requests = [RunRequest(run_key=s3_key, run_config={}) for s3_key in new_s3_keys]
+        run_requests = [RunRequest(run_key=last_key, run_config={})]
+        context.update_cursor(last_key)
+    return run_requests
+
+
+@sensor(name="s3_config_source_sensor",
+        default_status=DefaultSensorStatus.RUNNING,
+        #, job_name="sources_updated_job",
+        job=sources_asset_job,
+        required_resource_keys={"gleanerio"},
+        # minimum_interval_seconds=3600
+        )
+def sources_s3_sensor(context
+                      ):
+    """Request ``sources_asset_job`` when the sources config file has a newer LastModified.
+
+    The file is ``GLEANERIO_CONFIG_PATH + GLEANERIO_SOURCES_FILENAME`` on the
+    ``gleanerio`` resource's ``gs3``.
+    """
+    gleaner_s3 = context.resources.gleanerio.gs3
+    filename = f"{gleaner_s3.GLEANERIO_CONFIG_PATH}{gleaner_s3.GLEANERIO_SOURCES_FILENAME}"
+    return _config_file_run_requests(context, filename)
+
+@sensor(name="s3_configs_tenant__sensor",
+        default_status=DefaultSensorStatus.RUNNING,
+        #, job_name="sources_updated_job",
+        job=tenant_asset_job,
+        required_resource_keys={"gleanerio"},
+        # minimum_interval_seconds=3600
+        )
+def tenant_s3_sensor(context
+                     ):
+    """Request ``tenant_asset_job`` when the tenant config file has a newer LastModified.
+
+    The file is ``GLEANERIO_CONFIG_PATH + GLEANERIO_TENANT_FILENAME`` on the
+    ``gleanerio`` resource's ``gs3``.
+    """
+    gleaner_s3 = context.resources.gleanerio.gs3
+    filename = f"{gleaner_s3.GLEANERIO_CONFIG_PATH}{gleaner_s3.GLEANERIO_TENANT_FILENAME}"
+    return _config_file_run_requests(context, filename)
diff --git a/dagster/implnets/workflows/ingest/ingest/sensors/tenant_sensor.py b/dagster/implnets/workflows/ingest/ingest/sensors/tenant_sensor.py
new file mode 100644
index 00000000..42fd704e
--- /dev/null
+++ b/dagster/implnets/workflows/ingest/ingest/sensors/tenant_sensor.py
@@ -0,0 +1,104 @@
+from dagster import (
+op, job, Config,get_dagster_logger,
+sensor, RunRequest, RunConfig,SensorResult,
+SensorEvaluationContext,asset_sensor, EventLogEntry,
+SkipReason,
+AssetKey,
+static_partitioned_config,DynamicPartitionsDefinition,
+DefaultSensorStatus,DefaultScheduleStatus
+)
+from ..jobs.tenant_load import tenant_namespaces_job, release_asset_job
+from ..assets import tenant_partitions_def
+#from ..assets.tenant import build_community
+
+## Thinking. Doing this the wrong way.
+## for each source, we dynamically generate a set of tenants to load, rather than for each tenant we reload
+## So, at the end of a source load, we trigger a load tenants.
+## this figures out what tenants to load, and call those ops.
+
+## So the asset key is not tenant names, it is still source_names_active.
+
+# now we do need to build tenants when a new tenant is added.
+# this should just handle the cretion of namespaces, and adding the UI's + +@asset_sensor( asset_key=AssetKey(["ingest","tenant_names"]), + default_status=DefaultSensorStatus.RUNNING, +#default_status=DefaultScheduleStatus.RUNNING, + job=tenant_namespaces_job, + # jobs=[tenant_namespaces_job,release_asset_job] + # , minimum_interval_seconds=600 + ) +def tenant_names_sensor(context, asset_event: EventLogEntry): + context.log.info(f"tenant_names_sensor: start") + assert asset_event.dagster_event and asset_event.dagster_event.asset_key + context.log.info(f"asset_key: {asset_event.dagster_event.asset_key}") +# well this is a pain. but it works. Cannot just pass it like you do in ops + # otherwise it's just an AssetDefinition. + tenants = context.repository_def.load_asset_value(AssetKey(["ingest","tenant_names"])) + new_tenants = [ + tenant + for tenant in tenants + if not tenant_partitions_def.has_partition_key( + tenant, dynamic_partitions_store=context.instance + ) + ] + removed_tenants = [ + tenant + for tenant in tenant_partitions_def.get_partition_keys(dynamic_partitions_store=context.instance) + if not tenant in tenants + ] + for t in removed_tenants: + context.instance.delete_dynamic_partition("tenant_names_paritition", t) + context.log.info(f"Removed {removed_tenants}") + context.log.info(f"new tenant {new_tenants}") + return SensorResult( + run_requests=[ + RunRequest(partition_key=tenant + # , job_name=f"{source}_load" + , run_key=f"{tenant}_tenant" + ) for tenant in new_tenants + ], + dynamic_partitions_requests=[ + tenant_partitions_def.build_add_request(new_tenants) + ], + ) + +@asset_sensor( asset_key=AssetKey(["ingest","tenant_names"]), + default_status=DefaultSensorStatus.RUNNING, +#default_status=DefaultScheduleStatus.RUNNING, + # job=tenant_namespaces_job, + jobs=[tenant_namespaces_job,release_asset_job] + # , minimum_interval_seconds=600 + ) +def tenant_names_sensor_v2(context, asset_event: EventLogEntry): + assert asset_event.dagster_event and 
asset_event.dagster_event.asset_key + +# well this is a pain. but it works. Cannot just pass it like you do in ops + # otherwise it's just an AssetDefinition. + tenants = context.repository_def.load_asset_value(AssetKey(["ingest","tenant_names"])) + new_tenants = [ + tenant + for tenant in tenants + if not tenant_partitions_def.has_partition_key( + tenant, dynamic_partitions_store=context.instance + ) + ] +# in order for this to work, the tenant_release_job needs to be fed valid sources, +# from some aggreate from the sources in the new_tenants[*]['sources'] + + return SensorResult( + run_requests=[ + RunRequest(partition_key=tenant + , job_name="tenant_namespaces_job" + , run_key=f"{tenant}_tenant_namespace" + ) for tenant in new_tenants + ] + [ + RunRequest(partition_key=tenant + , job_name="tenant_release_job" + , run_key=f"{tenant}_tenant_release" + ) for tenant in new_tenants + ], + dynamic_partitions_requests=[ + tenant_partitions_def.build_add_request(new_tenants) + ], + ) diff --git a/dagster/implnets/workflows/ingest/ingest/utils.py b/dagster/implnets/workflows/ingest/ingest/utils.py new file mode 100644 index 00000000..df35ee78 --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest/utils.py @@ -0,0 +1,10 @@ + + +def PythonMinioAddress(url, port=None): + if (url.endswith(".amazonaws.com")): + PYTHON_MINIO_URL = "s3.amazonaws.com" + else: + PYTHON_MINIO_URL = url + if port is not None: + PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" + return PYTHON_MINIO_URL diff --git a/dagster/implnets/workflows/ingest/ingest_tests/__init__.py b/dagster/implnets/workflows/ingest/ingest_tests/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest_tests/__init__.py @@ -0,0 +1 @@ + diff --git a/dagster/implnets/workflows/ingest/ingest_tests/config.yaml b/dagster/implnets/workflows/ingest/ingest_tests/config.yaml new file mode 100644 index 00000000..e69de29b diff --git 
a/dagster/implnets/workflows/ingest/ingest_tests/test_assets.py b/dagster/implnets/workflows/ingest/ingest_tests/test_assets.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/dagster/implnets/workflows/ingest/ingest_tests/test_assets.py @@ -0,0 +1 @@ + diff --git a/dagster/implnets/workflows/ingest/pyproject.toml b/dagster/implnets/workflows/ingest/pyproject.toml new file mode 100644 index 00000000..30442bfc --- /dev/null +++ b/dagster/implnets/workflows/ingest/pyproject.toml @@ -0,0 +1,6 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[tool.dagster] +module_name = "ingest" diff --git a/dagster/implnets/workflows/ingest/setup.cfg b/dagster/implnets/workflows/ingest/setup.cfg new file mode 100644 index 00000000..e79daae3 --- /dev/null +++ b/dagster/implnets/workflows/ingest/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +name = tutorial diff --git a/dagster/implnets/workflows/ingest/setup.py b/dagster/implnets/workflows/ingest/setup.py new file mode 100644 index 00000000..e38cdd3c --- /dev/null +++ b/dagster/implnets/workflows/ingest/setup.py @@ -0,0 +1,12 @@ +from setuptools import find_packages, setup + +setup( + name="tutorial", + packages=find_packages(exclude=["tutorial_tests"]), + install_requires=[ + "dagster", + "dagster-cloud", + "Faker==18.4.0", + ], + extras_require={"dev": ["dagit", "pytest"]}, +) diff --git a/dagster/implnets/workflows/tasks/data/source_list.json b/dagster/implnets/workflows/tasks/data/source_list.json new file mode 100644 index 00000000..0637a088 --- /dev/null +++ b/dagster/implnets/workflows/tasks/data/source_list.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/dagster/implnets/workflows/tasks/tasks/__init__.py b/dagster/implnets/workflows/tasks/tasks/__init__.py index a36556aa..6d55696d 100644 --- a/dagster/implnets/workflows/tasks/tasks/__init__.py +++ b/dagster/implnets/workflows/tasks/tasks/__init__.py @@ -1,9 +1,86 @@ -from dagster import Definitions, 
load_assets_from_modules - +import os +from distutils.util import strtobool +from dagster import Definitions, load_assets_from_modules, EnvVar +from dagster_aws.s3 import S3Resource +#from dagster_slack import SlackResource, make_slack_on_run_failure_sensor from . import assets +from .sch import weekly_sch +from .sch.s3_sensor import tenant_s3_sensor +from .assets.tenants import community_sensor + +from .resources.graph import BlazegraphResource, GraphResource +from .resources.gleanerS3 import gleanerS3Resource + +from dagster_slack import SlackResource, make_slack_on_run_failure_sensor +slack_on_run_failure = make_slack_on_run_failure_sensor( + os.getenv("SLACK_CHANNEL"), + os.getenv("SLACK_TOKEN") +) +def _awsEndpointAddress(url, port=None, use_ssl=True): + if use_ssl: + protocol = "https" + else: + protocol = "http" + if port is not None: + return f"{protocol}://{url}:{port}" + else: + return f"{protocol}://{url}" all_assets = load_assets_from_modules([assets]) +# as noted: https://docs.dagster.io/concepts/assets/software-defined-assets#from-assets-in-a-sub-module +# tried to use load_assets_from_modules([assets] , key_prefix=["tasks"]) +# this meant that the prefix had to included in the code... 
so, just add it individually +weekly_data_schedule=[ weekly_sch.loadstats_schedule, weekly_sch.all_graph_stats_schedule] +s3 = S3Resource( + endpoint_url=_awsEndpointAddress(EnvVar('GLEANERIO_MINIO_ADDRESS').get_value(), + port=EnvVar('GLEANERIO_MINIO_PORT').get_value()), + aws_access_key_id=EnvVar('GLEANERIO_MINIO_ACCESS_KEY'), + aws_secret_access_key=EnvVar('GLEANERIO_MINIO_SECRET_KEY') +) +minio=gleanerS3Resource( + s3=s3, + # GLEANER_MINIO_BUCKET =EnvVar('GLEANER_MINIO_BUCKET'), + # GLEANER_MINIO_ADDRESS=EnvVar('GLEANER_MINIO_ADDRESS'), + # GLEANER_MINIO_PORT=EnvVar('GLEANER_MINIO_PORT'), + + GLEANERIO_MINIO_BUCKET=EnvVar('GLEANERIO_MINIO_BUCKET'), + GLEANERIO_MINIO_ADDRESS=EnvVar('GLEANERIO_MINIO_ADDRESS'), + GLEANERIO_MINIO_PORT=EnvVar('GLEANERIO_MINIO_PORT'), + GLEANERIO_MINIO_ACCESS_KEY=EnvVar('GLEANERIO_MINIO_ACCESS_KEY'), + GLEANERIO_MINIO_SECRET_KEY=EnvVar('GLEANERIO_MINIO_SECRET_KEY'), + GLEANERIO_CONFIG_PATH=EnvVar('GLEANERIO_CONFIG_PATH'), + GLEANERIO_TENANT_FILENAME=EnvVar('GLEANERIO_TENANT_FILENAME') + +) +triplestore=BlazegraphResource( + GLEANERIO_GRAPH_URL=EnvVar('GLEANERIO_GRAPH_URL'), + GLEANERIO_GRAPH_NAMESPACE=EnvVar('GLEANERIO_GRAPH_NAMESPACE'), + GLEANERIO_GRAPH_SUMMARY_NAMESPACE=EnvVar('GLEANERIO_GRAPH_SUMMARY_NAMESPACE'), + GLEANERIO_GRAPH_SUMMARIZE=EnvVar('GLEANERIO_GRAPH_SUMMARIZE'), + s3=minio, + ) + + +resources = { + "local": { + + "s3":minio, + "triplestore": triplestore, + # "slack": SlackResource(token=EnvVar("SLACK_TOKEN")), + }, + "production": { + + "s3":minio, + "triplestore":triplestore, + # "slack":SlackResource(token=EnvVar("SLACK_TOKEN")), + }, +} + +deployment_name = os.environ.get("DAGSTER_DEPLOYMENT", "local") defs = Definitions( assets=all_assets, + schedules=weekly_data_schedule, + resources=resources[deployment_name], + sensors=[community_sensor, tenant_s3_sensor, slack_on_run_failure] ) diff --git a/dagster/implnets/workflows/tasks/tasks/assets.py b/dagster/implnets/workflows/tasks/tasks/assets.py deleted file mode 
100644 index 7260b45a..00000000 --- a/dagster/implnets/workflows/tasks/tasks/assets.py +++ /dev/null @@ -1,62 +0,0 @@ -import json -import os - -import pandas as pd -from dagster import asset, get_dagster_logger -from ec.datastore import s3 - -GLEANER_MINIO_ADDRESS = os.environ.get('GLEANERIO_MINIO_ADDRESS') -GLEANER_MINIO_PORT = os.environ.get('GLEANERIO_MINIO_PORT') -GLEANER_MINIO_USE_SSL = os.environ.get('GLEANERIO_MINIO_USE_SSL') -GLEANER_MINIO_SECRET_KEY = os.environ.get('GLEANERIO_MINIO_SECRET_KEY') -GLEANER_MINIO_ACCESS_KEY = os.environ.get('GLEANERIO_MINIO_ACCESS_KEY') -GLEANER_MINIO_BUCKET = os.environ.get('GLEANERIO_MINIO_BUCKET') -# set for the earhtcube utiltiies -MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL - - ,"access_key": GLEANER_MINIO_ACCESS_KEY - ,"secret_key": GLEANER_MINIO_SECRET_KEY - } -REPORT_PATH = "reports/" -ORG_PATH = "orgs/" -STAT_FILE_NAME = "missing_report_graph.json" -def _pythonMinioUrl(url): - - if (url.endswith(".amazonaws.com")): - PYTHON_MINIO_URL = "s3.amazonaws.com" - else: - PYTHON_MINIO_URL = url - return PYTHON_MINIO_URL - -def getName(name): - return name.replace("orgs/","").replace(".nq","") -@asset() -def source_list() -> None: - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), MINIO_OPTIONS) - orglist = s3Minio.listPath(GLEANER_MINIO_BUCKET, ORG_PATH,recursive=False) - sources = map( lambda f: { "name": getName(f.object_name)}, orglist ) - - os.makedirs("data", exist_ok=True) - - - with open("data/source_list.json", "w") as f: - json.dump(list(sources), f) -#@asset(deps=[source_list]) -@asset(deps=[source_list]) -def loadstats() -> None: - logger = get_dagster_logger() - s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS),MINIO_OPTIONS) - # sourcelist = list(s3Minio.listPath(GLEANER_MINIO_BUCKET, ORG_PATH,recursive=False)) - - with open("data/source_list.json","r" ) as f: - sourcelist = json.load(f) - stats = [] - for source in sourcelist: - try: - stat = 
s3Minio.getReportFile(GLEANER_MINIO_BUCKET,source.get("name"), STAT_FILE_NAME ) - stat = json.loads(stat) - stats.append(stat) - except: - logger.info(f"Failed to get { source.get('name')} ") - df = pd.DataFrame(stats) - df.to_csv("data/weekly_stats.csv") diff --git a/dagster/implnets/workflows/tasks/tasks/assets/__init__.py b/dagster/implnets/workflows/tasks/tasks/assets/__init__.py new file mode 100644 index 00000000..50ddb949 --- /dev/null +++ b/dagster/implnets/workflows/tasks/tasks/assets/__init__.py @@ -0,0 +1,3 @@ +from .source_stats import source_list, loadstatsHistory +from .all_graph_stats import sos_types, all_report_stats +from .tenants import task_tenant_sources, task_tenant_names, loadstatsCommunity diff --git a/dagster/implnets/workflows/tasks/tasks/assets/all_graph_stats.py b/dagster/implnets/workflows/tasks/tasks/assets/all_graph_stats.py new file mode 100644 index 00000000..63f1dcaf --- /dev/null +++ b/dagster/implnets/workflows/tasks/tasks/assets/all_graph_stats.py @@ -0,0 +1,123 @@ +from distutils import util +import json +import os + +from dagster import asset, define_asset_job, get_dagster_logger, AssetKey +from ec.graph.sparql_query import queryWithSparql +from ec.reporting.report import generateGraphReportsRepo, reportTypes, generateReportStats +from ec.datastore import s3 +from ec.logger import config_app +from .tenants import task_tenant_names +from pydash import find + +log = config_app() + + +# GLEANERIO_MINIO_ADDRESS = str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) +# GLEANERIO_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) +# GLEANERIO_MINIO_USE_SSL = bool(util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL', 'true'))) +# GLEANERIO_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) +# GLEANERIO_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) +# GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) +# +# # set for the earhtcube utiltiies +# 
MINIO_OPTIONS={"secure":GLEANERIO_MINIO_USE_SSL +# +# ,"access_key": GLEANERIO_MINIO_ACCESS_KEY +# ,"secret_key": GLEANERIO_MINIO_SECRET_KEY +# } +# +# GLEANERIO_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) +# # using GLEANER, even though this is a nabu property... same prefix seems easier +# GLEANERIO_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) +# GLEANERIO_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) +# GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) +# GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) +# GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) +# GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) +# GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner +# GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) +# GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) +# GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) +# GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) +# GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner')) +# GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) +# GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu')) +# #GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT') +# GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_GRAPH_NAMESPACE',f"{GLEANERIO_GRAPH_NAMESPACE}_summary" ) +# 
GLEANERIO_SUMMARIZE_GRAPH=(os.getenv('GLEANERIO_GRAPH_SUMMARIZE', 'False').lower() == 'true') +# GLEANERIO_CSV_CONFIG_URL = str(os.environ.get('GLEANERIO_CSV_CONFIG_URL')) + +SUMMARY_PATH = 'graphs/summary' +RELEASE_PATH = 'graphs/latest' + +def _graphSummaryEndpoint(community, graph_resoruce): + if community == "all": + url = f"{graph_resoruce.GLEANERIO_GRAPH_URL}/namespace/{graph_resoruce.GLEANERIO_GRAPH_SUMMARY_NAMESPACE}/sparql" + else: + url = f"{graph_resoruce.GLEANERIO_GRAPH_URL}/namespace/{community}_summary/sparql" + return url +@asset(group_name="graph",key_prefix="task", required_resource_keys={"triplestore"}) +def sos_types(context ): + s3_resource = context.resources.triplestore.s3 + graph_resource = context.resources.triplestore + graphendpoint = f"{graph_resource.GLEANERIO_GRAPH_URL}/namespace/{graph_resource.GLEANERIO_GRAPH_NAMESPACE}/sparql" + get_dagster_logger().info("sos types endpoint: {}".format(graphendpoint)) + report = queryWithSparql("all_count_types", graphendpoint, parameters=None) + report_csv =report.to_csv() + # report_json = generateGraphReportsRepo("all", + # "", reportList=reportTypes["all"]) + MINIO_OPTIONS={"secure":s3_resource.GLEANERIO_MINIO_USE_SSL + + ,"access_key": s3_resource.GLEANERIO_MINIO_ACCESS_KEY + ,"secret_key": s3_resource.GLEANERIO_MINIO_SECRET_KEY + } + s3Minio = s3.MinioDatastore( s3_resource.GLEANERIO_MINIO_ADDRESS, MINIO_OPTIONS) + #data = f.getvalue() + + bucketname, objectname = s3Minio.putReportFile(s3_resource.GLEANERIO_MINIO_BUCKET,"all","sos_types.csv",report_csv) + return bucketname, objectname, report_csv + +#@asset(group_name="graph",key_prefix="task", required_resource_keys={"triplestore"}) +def all_report_stats(context, task_tenant_names): + s3_resource = context.resources.triplestore.s3 + graph_resource = context.resources.triplestore + MINIO_OPTIONS={"secure":s3_resource.GLEANERIO_MINIO_USE_SSL + + ,"access_key": s3_resource.GLEANERIO_MINIO_ACCESS_KEY + ,"secret_key": 
s3_resource.GLEANERIO_MINIO_SECRET_KEY + } + s3Minio = s3.MinioDatastore( s3_resource.GLEANERIO_MINIO_ADDRESS, MINIO_OPTIONS) + bucket = s3_resource.GLEANERIO_MINIO_BUCKET + # this is a file with a list of sources for a community. T + # this now exists in the tenant configuration file. + #source_url = s3_resource.GLEANERIO_CSV_CONFIG_URL + + tenants_all = context.repository_def.load_asset_value(AssetKey("tenant_all"))['tenant'] + + # TODO: remove the hardcoded community list + #community_list = ["all", "deepoceans", "ecoforecast", "geochemistry"] + #community_list = context.repository_def.load_asset_value(AssetKey("tenant_names")) + community_list = task_tenant_names + if (graph_resource.GLEANERIO_GRAPH_SUMMARIZE): + for community in community_list: + community_tenant = find(tenants_all, lambda x: x["community"] == community) + community_sources = community_tenant.get('sources') + try: + # update generateReportStats to take an array of source names + report = generateReportStats(community_sources, bucket, s3Minio, _graphSummaryEndpoint(community), community) + bucketname, objectname = s3Minio.putReportFile(bucket, "all", f"report_{community}_stats.json", report) + except Exception as e: + get_dagster_logger().info(f"Summary report errors: {str(e)}") + +#all_urn_w_types_toplevel.sparql +# returns all grapurns with a type. 
+# def top_level_types(): +# graphendpoint = f"{GLEANERIO_GRAPH_URL}/namespace/{GLEANERIO_GRAPH_NAMESPACE}/sparql" +# report = queryWithSparql("all_urn_w_types_toplevel", graphendpoint, parameters=None) +# report_csv =report.to_csv() +# # report_json = generateGraphReportsRepo("all", +# # "", reportList=reportTypes["all"]) +# s3Minio = s3.MinioDatastore( GLEANERIO_MINIO_ADDRESS, MINIO_OPTIONS) + + diff --git a/dagster/implnets/workflows/tasks/tasks/assets/source_stats.py b/dagster/implnets/workflows/tasks/tasks/assets/source_stats.py new file mode 100644 index 00000000..2c465d77 --- /dev/null +++ b/dagster/implnets/workflows/tasks/tasks/assets/source_stats.py @@ -0,0 +1,96 @@ +import distutils +import json +import os +from typing import List, Any +import pandas as pd +from dagster import asset, get_dagster_logger, define_asset_job +from ec.datastore import s3 +from pydash import pick +from distutils import util + +GLEANER_MINIO_ADDRESS = os.environ.get('GLEANERIO_MINIO_ADDRESS') +GLEANER_MINIO_PORT = os.environ.get('GLEANERIO_MINIO_PORT') +GLEANER_MINIO_USE_SSL = bool(util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL', 'true'))) +GLEANER_MINIO_SECRET_KEY = os.environ.get('GLEANERIO_MINIO_SECRET_KEY') +GLEANER_MINIO_ACCESS_KEY = os.environ.get('GLEANERIO_MINIO_ACCESS_KEY') +GLEANER_MINIO_BUCKET = os.environ.get('GLEANERIO_MINIO_BUCKET') +# set for the earhtcube utiltiies +MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL + + ,"access_key": GLEANER_MINIO_ACCESS_KEY + ,"secret_key": GLEANER_MINIO_SECRET_KEY + } +REPORT_PATH = "reports/" +TASKS_PATH="tasks/" +ORG_PATH = "orgs/" +STAT_FILE_NAME = "load_report_graph.json" +def _pythonMinioUrl(url): + + if (url.endswith(".amazonaws.com")): + PYTHON_MINIO_URL = "s3.amazonaws.com" + else: + PYTHON_MINIO_URL = url + return PYTHON_MINIO_URL + +def getName(name): + return name.replace("orgs/","").replace(".nq","") +@asset(group_name="load",key_prefix="task",) +def source_list() -> List[Any]: + s3Minio = 
s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), MINIO_OPTIONS) + orglist = s3Minio.listPath(GLEANER_MINIO_BUCKET, ORG_PATH,recursive=False) + sources = map( lambda f: { "name": getName(f.object_name)}, orglist ) + sources=list(sources) + source_json = json.dumps(sources) + os.makedirs("data", exist_ok=True) + + s3Minio.putReportFile(GLEANER_MINIO_BUCKET, "all", f"source_list.json", source_json ) + # with open("data/source_list.json", "w") as f: + # json.dump(list(sources), f) + return sources +#@asset(deps=[source_list]) + +# set a prefix so we can have some named stats file + +#@asset( group_name="load",key_prefix="task",) +@asset(group_name="load",key_prefix="task",) +def loadstatsHistory(context,source_list) -> str: + prefix="history" + logger = get_dagster_logger() + s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS),MINIO_OPTIONS) + # sourcelist = list(s3Minio.listPath(GLEANER_MINIO_BUCKET, ORG_PATH,recursive=False)) + + # with open("data/source_list.json","r" ) as f: + # sourcelist = json.load(f) + sourcelist=source_list + stats = [] + for source in sourcelist: + try: + # stat = s3Minio.getReportFile(GLEANER_MINIO_BUCKET,source.get("name"), STAT_FILE_NAME ) + repo = source.get("name") + dirs = s3Minio.listPath( GLEANER_MINIO_BUCKET,f"{REPORT_PATH}{repo}/",recursive=False ) + for d in dirs: + latestpath = f"{REPORT_PATH}{repo}/latest/" + if (d.object_name.casefold() == latestpath.casefold()) or (d.is_dir == False): + continue + path = f"{d.object_name}{STAT_FILE_NAME}" + s3ObjectInfo = {"bucket_name": GLEANER_MINIO_BUCKET, "object_name": path} + try: + resp = s3Minio.getFileFromStore(s3ObjectInfo) + stat = json.loads(resp) + stat = pick(stat, 'source', 'sitemap', 'date', 'sitemap_count', 'summoned_count', + 'missing_sitemap_summon_count', + 'graph_urn_count', 'missing_summon_graph_count') + stats.append(stat) + except Exception as ex: + logger.info(f"no missing graph report {source.get('name')} {ex}") + except Exception as ex: + 
logger.info(f"Failed to get { source.get('name')} {ex}") + df = pd.DataFrame(stats) + df.to_csv(f"data/all_stats.csv") + df_csv = df.to_csv() + s3Minio.putReportFile(GLEANER_MINIO_BUCKET, "all", f"all_stats.csv", df_csv) + context.log.info(f"all_stats.csv uploaded using putReportFile s3://{GLEANER_MINIO_BUCKET} all ") + #return df_csv + return df_csv + + diff --git a/dagster/implnets/workflows/tasks/tasks/assets/tenants.py b/dagster/implnets/workflows/tasks/tasks/assets/tenants.py new file mode 100644 index 00000000..8eab8fcc --- /dev/null +++ b/dagster/implnets/workflows/tasks/tasks/assets/tenants.py @@ -0,0 +1,231 @@ +import json +from typing import Any +from io import StringIO +import yaml +import os +import pandas as pd +from pydash import pick +from dagster import (asset, + get_dagster_logger, + Output, + DynamicPartitionsDefinition, + define_asset_job, + AssetSelection, + sensor,SensorResult,DefaultSensorStatus, + RunRequest, +asset_sensor, AssetKey, + ) +from ec.datastore import s3 +from distutils import util +from ..resources.gleanerS3 import _pythonMinioAddress +from ec.reporting.report import generateReportStats + +GLEANER_MINIO_ADDRESS = os.environ.get('GLEANERIO_MINIO_ADDRESS') +GLEANER_MINIO_PORT = os.environ.get('GLEANERIO_MINIO_PORT') +GLEANER_MINIO_USE_SSL = bool(util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL', 'true'))) +GLEANER_MINIO_SECRET_KEY = os.environ.get('GLEANERIO_MINIO_SECRET_KEY') +GLEANER_MINIO_ACCESS_KEY = os.environ.get('GLEANERIO_MINIO_ACCESS_KEY') +GLEANER_MINIO_BUCKET = os.environ.get('GLEANERIO_MINIO_BUCKET') +GLEANERIO_GRAPH_URL = os.environ.get('GLEANERIO_GRAPH_URL') +GLEANERIO_GRAPH_SUMMARY_NAMESPACE = os.environ.get('GLEANERIO_GRAPH_SUMMARY_NAMESPACE') +GLEANERIO_CSV_CONFIG_URL = os.environ.get('GLEANERIO_CSV_CONFIG_URL') + +MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL + + ,"access_key": GLEANER_MINIO_ACCESS_KEY + ,"secret_key": GLEANER_MINIO_SECRET_KEY + } + +def _graphSummaryEndpoint(community): + if community == 
"all": + url = f"{GLEANERIO_GRAPH_URL}/namespace/{GLEANERIO_GRAPH_SUMMARY_NAMESPACE}/sparql" + else: + url = f"{GLEANERIO_GRAPH_URL}/namespace/{community}_summary/sparql" + return url +@asset(group_name="community",key_prefix="task", + required_resource_keys={"triplestore"}) +def task_tenant_sources(context) ->Any: + s3_resource = context.resources.triplestore.s3 + t=s3_resource.getTennatInfo() + tenants = t['tenant'] + listTenants = map (lambda a: {a['community']}, tenants) + get_dagster_logger().info(str(t)) + + return t + # metadata={ + # "tennants": str(listTenants), # Metadata can be any key-value pair + # "run": "gleaner", + # # The `MetadataValue` class has useful static methods to build Metadata + # } + # ) +@asset(group_name="community",key_prefix="task", + #name='task_tenant_names', + required_resource_keys={"triplestore"}) +def task_tenant_names(context, task_tenant_sources) -> Output[Any]: + + tenants = task_tenant_sources['tenant'] + listTenants = map (lambda a: a['community'], tenants) + get_dagster_logger().info(str(listTenants)) + communities = list(listTenants) + return Output( + communities, + metadata={ + "tenants": str(listTenants), # Metadata can be any key-value pair + "run": "gleaner", + # The `MetadataValue` class has useful static methods to build Metadata + } + ) + + +community_partitions_def = DynamicPartitionsDefinition(name="tenantsPartition") +tenant_task_job = define_asset_job( + "tenant_job", AssetSelection.keys(AssetKey(["task","loadstatsCommunity"])), partitions_def=community_partitions_def +) +#@sensor(job=tenant_job) +@asset_sensor(asset_key=AssetKey(["task","task_tenant_names"]), + default_status=DefaultSensorStatus.RUNNING, + job=tenant_task_job) +def community_sensor(context): + tenants = context.repository_def.load_asset_value(AssetKey(["task","task_tenant_names"])) + new_community = [ + community + for community in tenants + if not context.instance.has_dynamic_partition( + community_partitions_def.name, community + ) + ] + + 
return SensorResult( + run_requests=[ + RunRequest(partition_key=community) for community in new_community + ], + dynamic_partitions_requests=[ + community_partitions_def.build_add_request(new_community) + ], + ) +REPORT_PATH = "reports/" +COMMUNITY_PATH = "reports/community/" +TASKS_PATH="tasks/" +ORG_PATH = "orgs/" +STAT_FILE_NAME = "load_report_graph.json" + +def _pythonMinioUrl(url): + + if (url.endswith(".amazonaws.com")): + PYTHON_MINIO_URL = "s3.amazonaws.com" + else: + PYTHON_MINIO_URL = url + return PYTHON_MINIO_URL + +def getName(name): + return name.replace("orgs/","").replace(".nq","") +# @asset(group_name="community") +# def source_list(task_tenant_sources) -> Output(str): +# s3Minio = s3.MinioDatastore(_pythonMinioUrl(GLEANER_MINIO_ADDRESS), MINIO_OPTIONS) +# orglist = s3Minio.listPath(GLEANER_MINIO_BUCKET, ORG_PATH,recursive=False) +# sources = map( lambda f: { "name": getName(f.object_name)}, orglist ) +# source_json = json.dumps(list(sources)) +# os.makedirs("data", exist_ok=True) +# +# s3Minio.putReportFile(GLEANER_MINIO_BUCKET, "all", f"source_list.json", source_json ) +# with open("data/source_list.json", "w") as f: +# json.dump(list(sources), f) +# return source_json +#@asset(deps=[source_list]) + +# set a prefix so we can have some named stats file + +#@asset( group_name="load") +@asset(partitions_def=community_partitions_def, + deps=[AssetKey(["task","task_tenant_sources"])], + group_name="community", + key_prefix="task", + required_resource_keys={"triplestore"} ) +def loadstatsCommunity(context, task_tenant_sources) -> str: + prefix="history" + logger = get_dagster_logger() + s3_config = context.resources.triplestore.s3 + s3Client = context.resources.triplestore.s3.s3.get_client() + s3Minio = s3.MinioDatastore(_pythonMinioUrl(s3_config.GLEANERIO_MINIO_ADDRESS), MINIO_OPTIONS) + # sourcelist = list(s3Minio.listPath(GLEANER_MINIO_BUCKET, ORG_PATH,recursive=False)) + community_code= context.asset_partition_key_for_output() + stats = [] + try: + 
ts = task_tenant_sources + t =list(filter ( lambda a: a['community']== community_code, ts["tenant"] )) + s = t[0]["sources"] + + for source in s: + dirs = s3Minio.listPath(GLEANER_MINIO_BUCKET,path=f"{REPORT_PATH}{source}/",recursive=False ) + for d in dirs: + latestpath = f"{REPORT_PATH}{source}/latest/" + if (d.object_name.casefold() == latestpath.casefold()) or (d.is_dir == False): + continue + path = f"{d.object_name}{STAT_FILE_NAME}" + s3ObjectInfo = {"bucket_name": GLEANER_MINIO_BUCKET, "object_name": path} + try: + # resp = s3Client.getFile(path=path) + resp = s3Minio.getFileFromStore(s3ObjectInfo) + stat = json.loads(resp) + stat = pick(stat, 'source', 'sitemap', 'date', 'sitemap_count', 'summoned_count', + 'missing_sitemap_summon_count', + 'graph_urn_count', 'missing_summon_graph_count') + stats.append(stat) + except Exception as ex: + context.log.info(f"Failed to get source {source} for tennant {community_code} {ex}") + except Exception as ex: + context.log.info(f"Failed to get tenant {community_code} {ex}") + # for source in task_tenant_sources["tennant"]: + # try: + # # stat = s3Minio.getReportFile(GLEANER_MINIO_BUCKET,source.get("name"), STAT_FILE_NAME ) + # repo = community_code + # dirs = s3Minio.listPath( path=f"{REPORT_PATH}{repo}/",recursive=False ) + # for d in dirs: + # latestpath = f"{REPORT_PATH}{repo}/latest/" + # if (d.object_name.casefold() == latestpath.casefold()) or (d.is_dir == False): + # continue + # path = f"/{d.object_name}{STAT_FILE_NAME}" + # + # try: + # resp = s3Minio.getFile(path=path) + # stat = json.loads(resp) + # stat = pick(stat, 'source', 'sitemap', 'date', 'sitemap_count', 'summoned_count', + # 'missing_sitemap_summon_count', + # 'graph_urn_count', 'missing_summon_graph_count') + # stats.append(stat) + # except Exception as ex: + # logger.info(f"no missing graph report {source.get('name')} {ex}") + # except Exception as ex: + # logger.info(f"Failed to get { source.get('name')} {ex}") + context.log.info(stats) + df = 
pd.DataFrame(stats) + context.log.info(df) + # try: + # os.mkdir(f"data/{community_code}") + # except FileExistsError: + # logger.debug(f"directory data/{community_code} exists") + # except FileNotFoundError: + # logger.error(f"error creating directory. Fix community name. 'data/{community_code}' ") + #df.to_csv(f"data/{community_code}/all_stats.csv") + + df_csv = df.to_csv() + + # stringio = StringIO(df_csv) + # s3Client.upload_fileobj(stringio, s3_config.GLEANERIO_MINIO_BUCKET, f"data/{community_code}/all_stats.csv") + # humm, should we just have an EC utils resource + s3Minio.putReportFile(s3_config.GLEANERIO_MINIO_BUCKET, f"tenant/{community_code}", f"all_stats.csv", df_csv) + # with open(stringio, "rb") as f: + # s3.upload_fileobj(f, s3.GLEANERIO_MINIO_BUCKET, f"data/all/all_stats.csv") + context.log.info(f"all_stats.csv uploaded using ec.datastore.putReportFile {s3_config.GLEANERIO_MINIO_BUCKET}tenant/{community_code} ") + #return df_csv # now checking return types + + context.log.info(f"GLEANERIO_CSV_CONFIG_URL {GLEANERIO_CSV_CONFIG_URL} ") + + report = generateReportStats(GLEANERIO_CSV_CONFIG_URL, s3_config.GLEANERIO_MINIO_BUCKET, s3Minio, + _graphSummaryEndpoint(community_code), community_code) + bucket, object = s3Minio.putReportFile(s3_config.GLEANERIO_MINIO_BUCKET, f"tenant/{community_code}", + f"report_stats.json", report) + context.log.info( + f"report_stats.json uploaded using ec.datastore.putReportFile {s3_config.GLEANERIO_MINIO_BUCKET}tenant/{community_code} ") + + return df_csv diff --git a/dagster/implnets/workflows/tasks/tasks/jobs/jobs.py b/dagster/implnets/workflows/tasks/tasks/jobs/jobs.py new file mode 100644 index 00000000..1ba501bd --- /dev/null +++ b/dagster/implnets/workflows/tasks/tasks/jobs/jobs.py @@ -0,0 +1,22 @@ +from dagster import ( + op, job, Config, + sensor, RunRequest, RunConfig, + SensorEvaluationContext, asset_sensor, EventLogEntry, + SkipReason, + AssetKey, + static_partitioned_config, dynamic_partitioned_config, 
import yaml
from dagster import asset, get_dagster_logger, define_asset_job, ConfigurableResource
from dagster_aws.s3 import S3Resource

#from dagster import Field
from pydantic import Field


def _pythonMinioAddress(url, port=None):
    """Return the host (optionally ``host:port``) used to reach the object store.

    A host ending in ``.amazonaws.com`` is collapsed to ``s3.amazonaws.com``;
    any other host is returned unchanged.  When ``port`` is not ``None`` it is
    appended as ``:port``.
    """
    # NOTE(review): the original indentation is not visible in this view; the
    # port suffix is assumed to apply to both branches -- confirm.
    host = "s3.amazonaws.com" if url.endswith(".amazonaws.com") else url
    if port is not None:
        host = f"{host}:{port}"
    return host


class gleanerS3Resource(ConfigurableResource):
    """Dagster resource bundling an ``S3Resource`` with the Gleaner bucket settings.

    The wrapped ``s3`` resource supplies the boto client; the remaining fields
    name the bucket, the endpoint, and where the tenant configuration lives.
    """

    s3: S3Resource
    GLEANERIO_MINIO_BUCKET: str = Field(
        description="GLEANERIO_MINIO_BUCKET.")
    # The two descriptions below were copy-pasted as "GLEANERIO_MINIO_BUCKET."
    # in the original; they now name the field they describe.
    GLEANERIO_MINIO_ADDRESS: str = Field(
        description="GLEANERIO_MINIO_ADDRESS.")
    GLEANERIO_MINIO_PORT: str = Field(
        description="GLEANERIO_MINIO_PORT.")
    GLEANERIO_MINIO_USE_SSL: bool = Field(
        default=False)
    GLEANERIO_CONFIG_PATH: str = Field(
        description="GLEANERIO_CONFIG_PATH.", default="scheduler/configs/test/")
    GLEANERIO_TENANT_FILENAME: str = Field(
        description="GLEANERIO_TENANT_CONFIG.", default="tenant.yaml")
    # now using the boto s3 embedded in dagster_aws, but just in case we need them
    GLEANERIO_MINIO_ACCESS_KEY: str = Field(
        description="GLEANERIO_MINIO_ACCESS_KEY")
    GLEANERIO_MINIO_SECRET_KEY: str = Field(
        description="GLEANERIO_MINIO_SECRET_KEY")

    ## https://docs.dagster.io/_apidocs/libraries/dagster-aws#s3
    # fields from dagster_aws.s3.S3Resource
    # region_name
    # endpoint_url
    # use_ssl
    # aws_access_key_id
    # aws_secret_access_key

    def listPath(self, path='orgs'):
        """List the objects in the configured bucket whose key starts with ``path``.

        Returns the ``Contents`` entries of the ``list_objects`` response, or an
        empty list when the response carries no ``Contents`` key (no object
        matches the prefix) instead of raising ``KeyError``.
        """
        response = self.s3.get_client().list_objects(
            Bucket=self.GLEANERIO_MINIO_BUCKET,
            Prefix=path,
        )
        return response.get("Contents", [])

    def getTennatInfo(self, path='orgs'):
        """Fetch and parse the tenant YAML file from the configured bucket.

        The object key is always ``GLEANERIO_CONFIG_PATH`` +
        ``GLEANERIO_TENANT_FILENAME``.  The ``path`` argument is kept only for
        backward compatibility and is ignored, exactly as in the original
        (which overwrote it on its first line).

        Returns the structure produced by ``yaml.safe_load``.  Any exception
        raised while fetching or parsing is logged and re-raised.
        """
        key = f"{self.GLEANERIO_CONFIG_PATH}{self.GLEANERIO_TENANT_FILENAME}"
        try:
            r = self.s3.get_client().get_object(
                Bucket=self.GLEANERIO_MINIO_BUCKET,
                Key=key,
            )
            return yaml.safe_load(r["Body"])
        except Exception:
            get_dagster_logger().info(f"tennant file {key} not found in bucket {self.GLEANERIO_MINIO_BUCKET} at {self.GLEANERIO_MINIO_ADDRESS} ")
            # bare raise keeps the original traceback intact
            raise
    #endpoint_url =_pythonMinioAddress(GLEANER_MINIO_ADDRESS, port=GLEANER_MINIO_PORT)
str(os.environ.get('GLEANERIO_MINIO_ADDRESS')) +# GLEANER_MINIO_PORT = str(os.environ.get('GLEANERIO_MINIO_PORT')) +# GLEANER_MINIO_USE_SSL = bool(distutils.util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL'))) +# GLEANER_MINIO_SECRET_KEY = str(os.environ.get('GLEANERIO_MINIO_SECRET_KEY')) +# GLEANER_MINIO_ACCESS_KEY = str(os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')) +# GLEANER_MINIO_BUCKET =str( os.environ.get('GLEANERIO_MINIO_BUCKET')) +# +# # set for the earhtcube utiltiies +# MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL +# +# ,"access_key": GLEANER_MINIO_ACCESS_KEY +# ,"secret_key": GLEANER_MINIO_SECRET_KEY +# } +# +# GLEANER_HEADLESS_ENDPOINT = str(os.environ.get('GLEANERIO_HEADLESS_ENDPOINT', "http://headless:9222")) +# # using GLEANER, even though this is a nabu property... same prefix seems easier +# GLEANER_GRAPH_URL = str(os.environ.get('GLEANERIO_GRAPH_URL')) +# GLEANER_GRAPH_NAMESPACE = str(os.environ.get('GLEANERIO_GRAPH_NAMESPACE')) +# GLEANERIO_GLEANER_CONFIG_PATH= str(os.environ.get('GLEANERIO_GLEANER_CONFIG_PATH', "/gleaner/gleanerconfig.yaml")) +# GLEANERIO_NABU_CONFIG_PATH= str(os.environ.get('GLEANERIO_NABU_CONFIG_PATH', "/nabu/nabuconfig.yaml")) +# GLEANERIO_GLEANER_IMAGE =str( os.environ.get('GLEANERIO_GLEANER_IMAGE', 'nsfearthcube/gleaner:latest')) +# GLEANERIO_NABU_IMAGE = str(os.environ.get('GLEANERIO_NABU_IMAGE', 'nsfearthcube/nabu:latest')) +# GLEANERIO_LOG_PREFIX = str(os.environ.get('GLEANERIO_LOG_PREFIX', 'scheduler/logs/')) # path to logs in nabu/gleaner +# GLEANERIO_GLEANER_ARCHIVE_OBJECT = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_OBJECT', 'scheduler/configs/GleanerCfg.tgz')) +# GLEANERIO_GLEANER_ARCHIVE_PATH = str(os.environ.get('GLEANERIO_GLEANER_ARCHIVE_PATH', '/gleaner/')) +# GLEANERIO_NABU_ARCHIVE_OBJECT=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_OBJECT', 'scheduler/configs/NabuCfg.tgz')) +# GLEANERIO_NABU_ARCHIVE_PATH=str(os.environ.get('GLEANERIO_NABU_ARCHIVE_PATH', '/nabu/')) +# 
# GLEANERIO_GLEANER_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_GLEANER_DOCKER_CONFIG', 'gleaner'))
# GLEANERIO_NABU_DOCKER_CONFIG=str(os.environ.get('GLEANERIO_NABU_DOCKER_CONFIG', 'nabu'))
# #GLEANERIO_SUMMARY_GRAPH_ENDPOINT = os.environ.get('GLEANERIO_SUMMARY_GRAPH_ENDPOINT')
# GLEANERIO_SUMMARY_GRAPH_NAMESPACE = os.environ.get('GLEANERIO_SUMMARY_GRAPH_NAMESPACE',f"{GLEANER_GRAPH_NAMESPACE}_summary" )
#
# SUMMARY_PATH = 'graphs/summary'
# RELEASE_PATH = 'graphs/latest'


class GraphResource(ConfigurableResource):
    """Dagster resource describing a SPARQL triplestore and the bucket it loads from.

    Object-store settings (address, port, SSL flag, bucket) are not fields of
    this class; they are read from the nested ``s3`` resource.
    """

    GLEANERIO_GRAPH_URL: str = Field(
        description="GLEANERIO_GRAPH_URL.")
    GLEANERIO_GRAPH_NAMESPACE: str = Field(
        description="GLEANERIO_GRAPH_NAMESPACE.")
    GLEANERIO_GRAPH_SUMMARY_NAMESPACE: str = Field(
        description="GLEANERIO_GRAPH_SUMMARY_NAMESPACE.")
    GLEANERIO_GRAPH_SUMMARIZE: str = Field(
        description="GLEANERIO_GRAPH_SUMMARIZE.")
    s3: gleanerS3Resource

    # need multiple namespaces. let's do this.
    def GraphEndpoint(self, namespace):
        """Return the SPARQL endpoint URL for ``namespace`` under ``GLEANERIO_GRAPH_URL``."""
        url = f"{self.GLEANERIO_GRAPH_URL}/namespace/{namespace}/sparql"
        return url

    @staticmethod
    def PythonMinioAddress(url, port=None):
        """Return the object-store host, with ``:port`` appended when ``port`` is given.

        A host ending in ``.amazonaws.com`` is collapsed to ``s3.amazonaws.com``.

        Declared as a static method: the original had neither ``self`` nor
        ``@staticmethod``, so ``self.PythonMinioAddress(address, port)`` passed
        three positional arguments to a two-parameter function and raised
        ``TypeError``.  Calls through the class keep working unchanged.
        """
        # NOTE(review): the original indentation is not visible in this view;
        # the port suffix is assumed to apply to both branches -- confirm.
        if (url.endswith(".amazonaws.com")):
            PYTHON_MINIO_URL = "s3.amazonaws.com"
        else:
            PYTHON_MINIO_URL = url
        if port is not None:
            PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}"
        return PYTHON_MINIO_URL

    def post_to_graph(self, source, path='graphs/latest', extension="nq", graphendpoint=None):
        """Ask the triplestore to ``LOAD`` a source's release file from the object store.

        Builds ``{proto}://{address}/{bucket}/{path}/{source}_release.{extension}``
        from the nested ``s3`` resource settings and POSTs a SPARQL update
        ``LOAD <url>`` to ``graphendpoint``.

        Args:
            source: source name; used as the release file's basename prefix.
            path: key prefix of the release file inside the bucket.
            extension: release file extension.
            graphendpoint: SPARQL endpoint URL; when ``None`` the endpoint of
                ``GLEANERIO_GRAPH_NAMESPACE`` is used.

        Returns:
            ``True`` when the endpoint answers with HTTP 200 (a
            ``mutationCount=0`` response is logged but still returns ``True``).

        Raises:
            Exception: when the endpoint answers with any other status code.
        """
        if graphendpoint is None:
            # GraphEndpoint() requires a namespace; the original called it with
            # none, which raised TypeError.  Default to the main namespace.
            graphendpoint = self.GraphEndpoint(self.GLEANERIO_GRAPH_NAMESPACE)
        # revision of EC utilities, will have a insertFromURL
        #instance = mg.ManageBlazegraph(os.environ.get('GLEANER_GRAPH_URL'),os.environ.get('GLEANER_GRAPH_NAMESPACE') )

        # this need to get file from s3.
        # The GLEANERIO_MINIO_* settings are fields of the nested s3 resource;
        # reading them from self (as the original did) raised AttributeError.
        store = self.s3
        proto = "https" if store.GLEANERIO_MINIO_USE_SSL else "http"
        address = self.PythonMinioAddress(store.GLEANERIO_MINIO_ADDRESS, store.GLEANERIO_MINIO_PORT)
        bucket = store.GLEANERIO_MINIO_BUCKET
        release_url = f"{proto}://{address}/{bucket}/{path}/{source}_release.{extension}"
        # BLAZEGRAPH SPECIFIC
        # url = f"{_graphEndpoint()}?uri={release_url}"  # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
        # get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
        # r = requests.post(url)
        # log.debug(f' status:{r.status_code}')  # status:404
        # get_dagster_logger().info(f'graph: insert: status:{r.status_code}')
        # if r.status_code == 200:
        #     # ''
        #     if 'data modified="0"' in r.text:
        #         get_dagster_logger().info(f'graph: no data inserted ')
        #         raise Exception("No Data Added: " + r.text)
        #     return True
        # else:
        #     get_dagster_logger().info(f'graph: error')
        #     raise Exception(f' graph: insert failed: status:{r.status_code}')

        ### GENERIC LOAD FROM
        url = f"{graphendpoint}"  # f"{os.environ.get('GLEANER_GRAPH_URL')}/namespace/{os.environ.get('GLEANER_GRAPH_NAMESPACE')}/sparql?uri={release_url}"
        get_dagster_logger().info(f'graph: insert "{source}" to {url} ')
        loadfrom = {'update': f'LOAD <{release_url}>'}
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded'
        }
        r = requests.post(url, headers=headers, data=loadfrom)
        get_dagster_logger().debug(f' status:{r.status_code}')  # status:404
        get_dagster_logger().info(f'graph: LOAD from {release_url}: status:{r.status_code}')
        if r.status_code == 200:
            get_dagster_logger().info(f'graph load response: {str(r.text)} ')
            # ''
            if 'mutationCount=0' in r.text:
                # an empty load is only logged; it is deliberately not an error
                get_dagster_logger().info(f'graph: no data inserted ')
                #raise Exception("No Data Added: " + r.text)
            return True
        else:
            get_dagster_logger().info(f'graph: error {str(r.text)}')
            raise Exception(f' graph: failed, LOAD from {release_url}: status:{r.status_code}')


class BlazegraphResource(GraphResource):
    """Blazegraph flavour of ``GraphResource``; adds nothing to the base class."""
    pass
from dagster import schedule, RunRequest, ScheduleEvaluationContext, define_asset_job, AssetSelection

# Asset jobs selecting every asset in the "load" and "graph" groups respectively.
load_analytics_job = define_asset_job(
    "load_analytics_job",
    selection=AssetSelection.groups("load"),
)
graph_analytics_job = define_asset_job(
    "graph_analytics_job",
    selection=AssetSelection.groups("graph"),
)


@schedule(cron_schedule="@weekly", job=load_analytics_job)
def loadstats_schedule(context: ScheduleEvaluationContext):
    """Weekly tick for ``load_analytics_job``: request one run with empty config."""
    request = RunRequest(run_key=None, run_config={})
    return request


@schedule(cron_schedule="@weekly", job=graph_analytics_job)
def all_graph_stats_schedule(context: ScheduleEvaluationContext):
    """Weekly tick for ``graph_analytics_job``: request one run with empty config."""
    request = RunRequest(run_key=None, run_config={})
    return request
These are now stored in +an s3 location + * gleaner configuration. a list of sources to load. (NOTE: This is also a docker config that needs to be updated to mactch to make things work) + * tenant configuration. a list communities, and which sources they load * The Dagster set which loads three containers to support workflow operations * The Gleaner Architecture images which loads three or more containers to support * s3 object storage * graph database (triplestore) * headless chrome for page rendering to support dynamically inserted JSON-LD * any other support packages like text, semantic or spatial indexes -* The GleanerIO tools which loads two containers as services (Gleaner and Nabu) that are run -and removed by the Dagster workflow -![upper level](images/gleanerDagster.svg) +### WORKFLOWS + +There are three workflows +* ingest works to load sources +* tasks weekly task +* ecrr - loads Earthcube Resource Registry + + + +```mermaid +--- +title: Dagster Stack +--- +flowchart LR + subgraph DockerCompose[Docker Compose Stacks] + maincompose[dagster/implents/deployment/compose_project.yaml] + project_overrides[dagster/implnets/deployment/compose_project_eco_override.yaml] + end + + subgraph Config + subgraph s3 + gleanconfig[gleanerconfig.yaml] + tenant[tenant.yaml] + + end + + subgraph Dagster/Config + workflow[ eco-wf ] + container-config-gleaner[gleanerio contaianer config] + container-config-nabu[gleanerio container config for nabu] + end + env['environment variables'] + + end + subgraph docker[docker managed by portainer] + + subgraph Containers + dagit + dagster + postgres + ingest + tasks + ecrr + end + config + subgraph Volumes + dagster-postgres + end + end + postgres--uses-->dagster-postgres + dagster--uses-->workflow + dagit--uses-->workflow + workflow-->config + maincompose--deploys-->dagit[dagster webserver] + maincompose--deploys-->dagster[dagster main] + maincompose--deploys-->ingest[gleanerio ingest code] + maincompose--deploys-->tasks[gleanerio task 
code] + project_overrides--deploys-->ecrr[earthcube code] + ingest--reads-->gleanconfig + ingest--reads-->tenant + tasks--reads-->gleanconfig + tasks--reads-->gleanconfig + dagster--uses-->postgres + +``` +#### basic deployment + +1. information for environment variables is created +2. The configuration files are created and loaded to s3, and docker/config +2. a docker stack is created, and the environment variables are added. +3. portainer deploys containers +4. when ingest and tasks are executed, they read + + +#### Ingest Workflow +```mermaid +--- +title: Ingest Workflow Sequence +--- +sequenceDiagram + participant S3 + participant Ingest + participant Portainer + participant Graph + S3->>Ingest: read sources from scheduler/configs/gleanerconfig.yaml + S3->>Ingest: read tenant from scheduler/configs/tenant.yaml + Ingest-->Ingest: create gleanerio container + Ingest->>Portainer: run gleanerio + Portainer-->Portainer: docker configs mounted in gleanerio container + Portainer-->Portainer: summon for sources + Portainer->>S3: jsonld to s3 + Portainer->>Ingest: logs returned + Ingest->>S3: logs from run to S3 + Ingest->>Ingest: create load reports using EC Utils + Ingest->>S3: load reports to s3 + Ingest->>Portainer: run nabu to + Portainer-->Portainer: convert jsonld to release and release summary + Portainer->>S3: release and release summary to s3 + Ingest->>Ingest: create graph report using EC Utils + Ingest->>S3: graph report to s3 + Ingest->>Graph: Create a namespaces for tenant + Ingest->>Graph: load release and release summary to namespaces +``` -### Template files - -The template files define the Dagster Ops, Jobs and Schedules. From these -and a GleanerIO config file a set of Python scripts for Dagster are created in -the output directory. - -These only need to be changed or used to regenerate if you wish to alter the -execution graph (ie, the ops, jobs and schedules) or change the config file. -In the later case only a regeneration needs to be done. 
+```mermaid +--- +title: Ingest Simplified Flowchart +--- +flowchart LR + subgraph config + s3_config_sensors + end + subgraph jobs + summon_and_release + tenant_release + end + subgraph assets + sources + tenants + end + + + + s3_config_sensors--monitors --> configs + s3_config_sensors--writes -->sources + s3_config_sensors--writes -->tenants + summon_and_release--uses-->sources --runs --> gleanerio + tenant_release--uses-->tenants --runs --> tenant_release + gleanerio--stores JSONLD -->summon + gleanerio--stores log -->logs + summon_and_release-- reads --> summon + summon_and_release-- converts to graph -->graph_path + tenant_release -- monitors --> graph_path + tenant_release -- loads releases to --> tenant_namespace + tenant_release -- loads releases to --> tenant_summary_namespace + + + subgraph portainer + gleanerio + tenant_ui + end + subgraph services + triplestore + tenant_namespace + tenant_summary_namespace + end + + subgraph minio_s3 + subgraph bucket_paths + subgraph scheduler + configs["`scheduler/configs`"] + logs + end + summon + graph_path['graph'] + end + end + + + + + +``` -There are then Docker build scripts to build out new containers. +#### Task workflows +```mermaid +--- +title: Task Workflow Sequence +--- +sequenceDiagram + participant S3 + participant Ingest + participant Portainer + participant Graph + Ingest->>Ingest: all_graph_stats assets: graph statistics using EC Utils + Ingest->>S3: load all_graph_stats to s3 + Ingest->>Ingest: source_stats assets: loadstatsHistory using EC Utils + Ingest->>Graph: sparql query to get graph stats + Graph->>Ingest: results for source_stats + Ingest->>S3: source_stats to s3 + +``` -See: [template](./implnets/src/implnet-example/templates) ## Steps to build and deploy -The deployment can be tested locally using docker. -The production 'containers' are built with a github action, or using a makefile. +The deployment can be tested locally. 
You can setup a services stack in docker to locally test, or use existing +services. + +The production 'containers' dagster, gleaner, and nabu are built with a github action. You can also use a makefile. This describes the local and container deployment We use portainer to manage our docker deployments. - -1) move to the the deployment directory -2) copy the envFile.env to .env +## Server Deployment. + [Production example for Earthcube](eco_deploy.md) + +## DEVELOPER Pycharm -- Run local with remote services +You can test components in pycharm. Run configurations for pycgharm are in runConfigurations (TODO: Instructions) +use the [ENVFIle plugin.](https://plugins.jetbrains.com/plugin/7861-envfile) +![pycharm runconfig](images/pycharm_runconfig.png) +1) move to the implnets/deployment directory +2) copy the envFile.env to .env [see](#environment-files) use the [ENVFIle plugin.](https://plugins.jetbrains.com/plugin/7861-envfile) +3) edit the entries to point at a portainer/traefik with running services +4) edit configuration files in implnets/configs/PROJECT: gleanerconfig.yaml, tenant.yaml +5) upload configuration implnets/configs/PROJECT to s3 scheduler/configs: gleanerconfig.yaml, tenant.yaml +4) run a Pycharm runconfig + 5) eg dagster_ingest_debug +4) go to http://localhost:3000/ +6) you can [test the schedules](#test-schedules) + +## full stack test Run local with remote services +1) move to the implnets/deployment directory +2) copy the envFile.env to .env [see](#environment-files)use the [ENVFIle plugin.](https://plugins.jetbrains.com/plugin/7861-envfile) [see](#environment-files) use the [ENVFIle plugin.](https://plugins.jetbrains.com/plugin/7861-envfile) 3) edit the entries. 
+4) edit configuration files in implnets/configs/PROJECT to s3: gleanerconfig.yaml, tenant.yaml +5) upload configuration implnets/configs/PROJECT to scheduler/configs s3: gleanerconfig.yaml, tenant.yaml 4) for local, `./dagster_localrun.sh` 5) go to http://localhost:3000/ To deploy in portainer, use the deployment/compose_project.yaml docker stack. ### docker compose Configuration: -1) there are three files that need to be installed into docker configs. - -| file | local | stack | note | -|--------------------|-------------------------------------| ------ |--------------------------| -| workspace | configs/PROJECT/worksapce.yaml | env () | used by dagster | -| gleanerconfig.yaml | configs/PROJECT/gleanerconfigs.yaml | env () | needs to be in portainer | -| nabuconfig.yaml | configs/PROJECT/nabuconfigs.yaml | env () | needs to be in portainer | -2) - -## Editing Template - -you can edit implnets/template - -then deploy with - -`pygen.py -cf ./configs/eco/gleanerconfig.yaml -od ./generatedCode/implnet-eco/output -td ./templates/v1 -d 7 `` - -If you are running using dagster_localrun.sh -1) go to the deployment at http://localhost:3000/locations -2) click 'reload on gleaner@project_grpc' -3) then if code is correct, then you will be able run the changed [workflows](http://localhost:3000/overview/jobs) - -(TODO NEEDS MORE -) - -## MAKEFILE -1) Place your gleanerconfig.yaml (use that exact name) in _confgis/NETWORK/gleanerconfig.yaml_ - 1) Note: When doing your docker build, you will use this NETWORK name as a value in the command such as - ```bash - podman build --tag="docker.io/fils/dagster_nsdf:$(VERSION)" --build-arg implnet=nsdf --file=./build/Dockerfile - ``` -1) Make any needed edits to the templates in directory _templates/v1/_ or make your own template set in that directory - -The command to build using the pygen.py program follows. This is done from the standpoint of running in from the -implenet directory. 
- -```bash - python pygen.py -cf ./configs/nsdf/gleanerconfig.yaml -od ./generatedCode/implnet-nsdf/output -td ./templates/v1 -d 7 -``` - -1) This will generate the code to build a dagster instance from the combination of the templates and gelanerconfig.yaml. -2) - - - - +there are configuration files that are needed. +They are installed in two places: +* as docker configs +* as scheduler configs in S3 + + (NOTE: I think the configs are still needed in the containers) + +| file | local | | note | +|--------------------|------------------------------------------|---------------------------------------------------|-----------------------------------------| +| workspace | configs/PROJECT/worksapce.yaml | dockerconfig: workspace | docker compose: used by dagster | +| gleanerconfig.yaml | configs/PROJECT/gleanerconfig.yaml | s3:{bucket}/scheduler/configs/gleanerconfigs.yaml | ingest workflow needs to be in minio/s3 +| tenant.yaml | configs/PROJECT/tenant.yaml | s3:{bucket}/scheduler/configs/tenant.yaml | ingest workflow needs to be in minio/s3 +| dagster.yaml | dagster/implnets/deployment/dagster.yaml | dockerconfig: dagster | docker compose: used by dagster +| gleanerconfig.yaml | configs/PROJECT/gleanerconfig.yaml | dockerconfig: gleaner | mounted in gleaner docker container +| nabuconfig.yaml | configs/PROJECT/nabuconfig.yaml | dockerconfig: nabu | mounted in gleaner docker container + +(NOTE: This is also a gleaner config (below in runtime configuration) that needs to be updated to mactch to make things work) + +[Docker Configs for gleanerio containers ](https://github.com/earthcube/scheduler/issues/106) are still needed: + +| file | local | stack | note | +|---------------------|-----------------------------------------------------------| ------ |---------------------------------------| +| gleanerconfig.yaml | configs/PROJECT/gleanerconfigs.yaml | env () | generated code needs to be in ~~portainer~~ | +| nabuconfig.yaml | configs/PROJECT/nabuconfigs.yaml | env () | 
generated code needs to be in ~~portainer~~ | + +3) when the containers are running in a stack, on portainer, they will need to + be updated by pulling from dockerhub. The ENV variables may need to be updated for the CONTAINER*_TAG + + +## Runtime configuration + +### upload to an s3 bucket + +| file | local | | note | +|--------------------|---------------------------------------------------| ------ |---------------------------------------| +| gleanerconfig.yaml | s3:{bucket}/scheduler/configs/gleanerconfig.yaml | | ingest workflow needs to be in minio/s3 +| tenant.yaml | s3:{bucket}/scheduler/configs/tenant.yaml | | ingest workflow needs to be in minio/s3 + +### updating config +You can update a config, and a sensor should pick up the changes. +1) Upload changed file to s3 + 2) note, if this is a new source, you need to add it to the docker config (gleaner-PROJECT). +2) go to overview, ![overview](images/overview_sensors_tab.png) +3) go to s3_config_source_sensor for gleanerconfig.yaml changes, and s3_config_tenant_sensor for tenant.yaml changes + ![sensor](images/sources_sensor.png). +4) at some point, a run should occur. ![run](images/runs.png). +5) then go to the sources_sensor, or tenant sensor +if the job does not run, you can do a backfill. +#### new sources: +6) go to job tab, and run summon_and_release with the 'partitions' aka 'sources' that are recent. +7) click materialize_all, and in the backfill dialog be sure only the added partition is selected. ![backfill](images/materialize.png). +8) go to runs, and see that a job with a partition with that name is queued/running +9) run tenant_release_job with same partition name to load data to tenants +### +#### new tenants: +There are two jobs that need to run to move data to a tenant. (third will be needed for UI) +6) go to job tab, and run tenant_namespaces_job with the 'partitions' aka 'tenant' that are recent. 
+7) click materialize_all, and be sure only the added partition is selected +8) go to runs, and see that a job with a partition with that name is queued/running +6) go to job tab, and run tenant_release_job with the 'partitions' aka 'sources' for that tenant +7) click materialize_all, The data will be pushed to all tenant namespaces + +## test schedules + +![schedules tab](images/schedules_tab.png) +![schedules example](images/schedules_example.png) +![schedules select](images/schedules_select.png) +![schedules test](images/schedules_test.png) ### Environment files 1) cp deployment/envFile.env .env 2) edit 3) `export $(cat .env | xargs)` export $(cat .env | xargs) -``` bash +```yaml ###### # Nabu and Gleaner configs need to be in docker configs ## docker config name GLEANER_GLEANER_DOCKER_CONFIG ## docker config name GLEANER_NABU_DOCKER_CONFIG # suggested DOCKER_CONFIG NAMING PATTERN (nabu||gleaner)-{PROJECT} ######## -GLEANERIO_GLEANER_DOCKER_CONFIG=gleaner-eco -GLEANERIO_NABU_DOCKER_CONFIG=nabu-eco +GLEANERIO_DOCKER_GLEANER_CONFIG=gleaner-eco +GLEANERIO_DOCKER_NABU_CONFIG=nabu-eco # ### # workspace for dagster #### GLEANERIO_WORKSPACE_CONFIG_PATH=/usr/src/app/workspace.yaml -GLEANERIO_WORKSPACE_DOCKER_CONFIG=workspace-eco +GLEANERIO_DOCKER_WORKSPACE_CONFIG=workspace-eco + + + +DEBUG_CONTAINER=false + +#### HOST +# host base name for traefik. fixed to localhost:3000 when using compose_local. +HOST=localhost +# Applies only to compose_project.yaml runs +# modify SCHED_HOSTNAME if you want to run more than one instance +# aka two different project harvests for now. 
+SCHED_HOSTNAME=sched -# NETWORK is needed for headless rendering -# gleaner +GLEANERIO_DOCKER_CONTAINER_WAIT_TIMEOUT=300 +# debugging set to 10 - 30 seconds -DEBUG=False PROJECT=eco #PROJECT=iow #PROJECT=oih -HOST=localhost +# tags for docker compose +CONTAINER_CODE_TAG=latest +CONTAINER_DAGSTER_TAG=latest + PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python # port is required: https://portainer.{HOST}:443/api/endpoints/2/docker/ -PORTAINER_URL= -PORTAINER_KEY= +GLEANERIO_DOCKER_URL=https://portainer.{HOST}:443/api/endpoints/2/docker/ +GLEANERIO_PORTAINER_APIKEY= +# if running dagster-dev, then this needs to be set , +# defaults to "/scheduler/gleanerconfig.yaml" which is path to config mounted in containers +# when debugging generated code "../../../configs/eco/gleanerconfig.yaml" +# when debugging code in workflows "../../configs/eco/gleanerconfig.yaml" +GLEANERIO_DAGSTER_CONFIG_PATH=../../../configs/eco/gleanerconfig.yaml # Network -GLEANERIO_HEADLESS_NETWORK=headless_gleanerio +GLEANERIO_DOCKER_HEADLESS_NETWORK=headless_gleanerio ### GLEANER/NABU Dockers -GLEANERIO_GLEANER_IMAGE=nsfearthcube/gleaner:latest -GLEANERIO_NABU_IMAGE=nsfearthcube/nabu:latest - - +GLEANERIO_GLEANER_IMAGE=nsfearthcube/gleaner:dev_ec +GLEANERIO_NABU_IMAGE=nsfearthcube/nabu:dev_eco ## # path where configs are deployed/mounted @@ -158,12 +386,8 @@ GLEANERIO_NABU_IMAGE=nsfearthcube/nabu:latest GLEANERIO_GLEANER_CONFIG_PATH=/gleaner/gleanerconfig.yaml GLEANERIO_NABU_CONFIG_PATH=/nabu/nabuconfig.yaml ### - - - - - -#GLEANERIO_LOG_PREFIX=scheduler/logs/ +#path in s3 for docker log files +GLEANERIO_LOG_PREFIX=scheduler/logs/ GLEANERIO_MINIO_ADDRESS= GLEANERIO_MINIO_PORT=80 @@ -173,100 +397,32 @@ GLEANERIO_MINIO_ACCESS_KEY= GLEANERIO_MINIO_SECRET_KEY= GLEANERIO_HEADLESS_ENDPOINT=http://headless:9222 - - # just the base address, no namespace https://graph.geocodes-aws-dev.earthcube.org/blazegraph -GLEANERIO_GRAPH_URL= -GLEANERIO_GRAPH_NAMESPACE= - - -``` - -# Implementation Networks - -This 
([https://github.com/sharmasagar25/dagster-docker-example](https://github.com/sharmasagar25/dagster-docker-example)) -is an example on how to structure a [Dagster] project in order to organize -the jobs, repositories, schedules, and ops. The example also contains -examples on unit-tests and a docker-compose deployment file that utilizes a -Postgresql database for the run, event_log and schedule storage. - -This example should in no way be considered suitable for production and is -merely my own example of a possible file structure. I personally felt that it -was difficult to put the Dagster concepts to use since the projects own examples -had widely different structure and was difficult to overview as a beginner. - -The example is based on the official [tutorial]. - -## Folders +GLEANERIO_GRAPH_URL=https://graph.geocodes-aws.earthcube.org/blazegraph +GLEANERIO_GRAPH_NAMESPACE=mytest -* build: build directives for the docker containers -* configs -* src -* tooling +# optional: GLEANERIO_GRAPH_SUMMARY_ENDPOINT defaults to GLEANERIO_GRAPH_URL +#GLEANERIO_GRAPH_SUMMARY_ENDPOINT=https://graph.geocodes-aws-dev.earthcube.org/blazegraph +GLEANERIO_GRAPH_SUMMARY_NAMESPACE=mytest_summary +GLEANERIO_GRAPH_SUMMARIZE=True -## Requirements +# where are the gleaner and tennant configurations +GLEANERIO_CONFIG_PATH="scheduler/configs/" +GLEANERIO_TENANT_FILENAME="tenant.yaml" +GLEANERIO_SOURCES_FILENAME="gleanerconfig.yaml" -At this point it is expected that you have a valid Gleaner config file named -_gleanerconfig.yaml_ located in some path within the _configs_ directory. +# ECO Custom variables for ecrr +ECRR_GRAPH_NAMESPACE=ecrr +ECRR_MINIO_BUCKET=ecrr -## Building the dagster code from templates +# only a public slack channel works. 
DV has no permissions to create a new channel +#SLACK_CHANNEL="#production_discussion" +SLACK_CHANNEL="#twitterfeed" +SLACK_TOKEN= -The python program pygen will read a gleaner configuration file and a set of -template and build the Dagster code from there. - -```bash -python pygen.py -cf ./configs/nsdf/gleanerconfig.yaml -od ./src/implnet-nsdf/output -td ./src/implnet-nsdf/templates -d 7 -``` - - -## Running - -There is an example on how to run a single pipeline in `src/main.py`. First -install the dependencies in an isolated Python environment. - -```bash -pip install -r requirements ``` -The code built above can be run locally, though your templates may be set up -to reference services and other resources not present on your dev machine. For -complex examples like these, it can be problematic. - -If you are looking for some simple examples of Dagster, check out the directory -examples for some smaller self-contained workflows. There are good for testing -things like sensors and other approaches. - -If you wish to still try the generated code cd into the output directory -you specified in the pygen command. -Then use: - -```bash -dagit -h ghost.lan -w workspace.yaml -``` - -## Building - -```bash - podman build -t docker.io/fils/dagster:0.0.24 . -``` - -```bash - podman push docker.io/fils/dagster:0.0.24 -``` - - - -# Appendix - -## Setup - - -![orchestration](images/orchestrationInit.svg) - -## Docker API sequence - -![sequence](../docs/images/sequence.svg) ## Appendix @@ -278,27 +434,47 @@ at the documentation for [Accessing the Portainer API](https://docs.portainer.io ## Notes -Single file testing run -```bash - dagit -h ghost.lan -f test1.py -``` +### Handle Multiple Organizations -* Don't forget to set the DAGSTER_HOME dir like in +thoughts... -```bash - export DAGSTER_HOME=/home/fils/src/Projects/gleaner.io/scheduler/python/dagster -``` +* Each organization can be in a container with its own code workflow. 
+ * in the workflows directory: `dagster project projectname` +* If we can standardize the loading and transforming workflows as much as possible, then the graph loading workflows + should be [standardized](https://github.com/earthcube/scheduler/issues/142). We could just define an additional container in a compose file, and add that to the workflows ``` -dagster-daemon run -``` - -Run from directory where workspace.yaml is. -``` -dagit --host 192.168.202.159 +load_from: +# - python_file: +# relative_path: "project/eco/repositories/repository.py" +# location_name: project +# working_directory: "./project/eco/" +# - python_file: +# relative_path: "workflows/ecrr/repositories/repository.py" +# working_directory: "./workflows/ecrr/" + # module starting out with the definitions api + # - python_module: "workflows.tasks.tasks" + + - grpc_server: + host: dagster-code-tasks + port: 4000 + location_name: "tasks" + - grpc_server: + host: dagster-code-eco-ingest + port: 4000 + location_name: "ingest" + - grpc_server: + host: dagster-code-ios-ingest + port: 4000 + location_name: "ingest" + - grpc_server: + host: dagster-code-eco-ecrr + port: 4000 + location_name: "ecrr" ``` +* to add a container, you need to edit the workflows.yaml in an organizations configuration ## Cron Notes @@ -336,7 +512,4 @@ We can then use the docker approach to run indexes on specific sources in these configuration files. -## References - -* [Simple Dagster example](https://bakerwho.github.io/posts/datascience/Deployable-Dagster-MVP/) diff --git a/docs/README_LOCAL_DEVELOPMENT.md b/docs/README_LOCAL_DEVELOPMENT.md index eb0b3c80..730ad7d4 100644 --- a/docs/README_LOCAL_DEVELOPMENT.md +++ b/docs/README_LOCAL_DEVELOPMENT.md @@ -52,17 +52,7 @@ You need to set the environment based on dagster/implnets/deployment/envFile.env will run just the task, and in editable form, i think. 
-### testing generated code -`cd generatedCode/PROJECT/output/` - -`export $(sed '/^[ \t]*#/d' ../../../deployment/.env | sed '/^$/d' | xargs)` - -`dagster dev` - -??? note - YOU CANNOT SET BREAKPOINTS IN TEMPLATES - YOU NEED TO cd generatedCode/PROJECT/output/jobs and set them in the job you are testing. ## TESTING CONTAINERS diff --git a/docs/add_containers.md b/docs/add_containers.md deleted file mode 100644 index 64e9f1e7..00000000 --- a/docs/add_containers.md +++ /dev/null @@ -1,22 +0,0 @@ -# Containers for Dagster Scheduler - -## Things to work on -* secrets -* modify compose to use a project variable -* network modify to use passed evn for network names - -Future: -* can we use a volume. Use git to pull? - -## Build Docker -Needs to be automated with a workflow - -something about archive file - -## add Stack. - dagster/deployment/compose.yaml make usre that it is the correct version. - image: docker.io/fils/dagster_eco:0.0.44 - -NOTE: we can use a env ${project} varibale like we do for geocodes_.._named.yaml - - diff --git a/docs/developement.md b/docs/developement_uisng_generated_code.md similarity index 88% rename from docs/developement.md rename to docs/developement_uisng_generated_code.md index a29e5d1d..8823b858 100644 --- a/docs/developement.md +++ b/docs/developement_uisng_generated_code.md @@ -1,5 +1,9 @@ # Scheduler Developement in Dagster + **NOTE: originally, a set of workflows was generated for each source. These were compiled into separate 'project' containers by a github workflo +This is no longer needed. But these were the original instructions.** + + !!! 
Note add [envfile](https://plugins.jetbrains.com/plugin/7861-envfile) plug in to PyCharm to allow for easy debugging to code @@ -102,12 +106,12 @@ GLEANERIO_WORKSPACE_CONFIG_PATH=/usr/src/app/workspace.yaml GLEANERIO_WORKSPACE_DOCKER_CONFIG=workspace-eco -# NETWORK is needed for headless rendering -# gleaner - - DEBUG=False +GLEANERIO_CONTAINER_WAIT_SECONDS=300 +# debuggin set to 5 or 10 seconds PROJECT=eco +CONTAINER_CODE_TAG=latest +CONTAINER_DAGSTER_TAG=latest #PROJECT=iow #PROJECT=oih HOST=localhost @@ -115,7 +119,13 @@ PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python # port is required: https://portainer.{HOST}:443/api/endpoints/2/docker/ PORTAINER_URL= PORTAINER_KEY= - +# if running dagster-dev, then this needs to be set , +# defaults to "/scheduler/gleanerconfig.yaml" which is path to config mounted in containers +# when debugging generated code "../../../configs/eco/gleanerconfig.yaml" +# when debugging code in workflows "../../configs/eco/gleanerconfig.yaml" +# DAGSTER_GLEANER_CONFIG_PATH=../../../configs/eco/gleanerconfig.yaml +GLEANERIO_CONTAINER_WAIT_SECONDS=3600 +#GLEANERIO_CONTAINER_WAIT_SECONDS=30 # Network GLEANERIO_HEADLESS_NETWORK=headless_gleanerio @@ -123,20 +133,14 @@ GLEANERIO_HEADLESS_NETWORK=headless_gleanerio GLEANERIO_GLEANER_IMAGE=nsfearthcube/gleaner:latest GLEANERIO_NABU_IMAGE=nsfearthcube/nabu:latest - - ## # path where configs are deployed/mounted #### GLEANERIO_GLEANER_CONFIG_PATH=/gleaner/gleanerconfig.yaml GLEANERIO_NABU_CONFIG_PATH=/nabu/nabuconfig.yaml ### - - - - - -#GLEANERIO_LOG_PREFIX=scheduler/logs/ +#path in s3 for docker log files +GLEANERIO_LOG_PREFIX=scheduler/logs/ GLEANERIO_MINIO_ADDRESS= GLEANERIO_MINIO_PORT=80 @@ -146,12 +150,14 @@ GLEANERIO_MINIO_ACCESS_KEY= GLEANERIO_MINIO_SECRET_KEY= GLEANERIO_HEADLESS_ENDPOINT=http://headless:9222 - - # just the base address, no namespace https://graph.geocodes-aws-dev.earthcube.org/blazegraph GLEANERIO_GRAPH_URL= GLEANERIO_GRAPH_NAMESPACE= +# example: 
https://graph.geocodes.ncsa.illinois.edu/blazegraph/namespace/yyearthcube2/sparql +#graph endpoint will be GLEANERIO_GRAPH_URL +GLEANERIO_SUMMARY_GRAPH_NAMESPACE= +GLEANERIO_SUMMARIZE_GRAPH=True ``` diff --git a/docs/eco_deploy.md b/docs/eco_deploy.md new file mode 100644 index 00000000..888e15f3 --- /dev/null +++ b/docs/eco_deploy.md @@ -0,0 +1,77 @@ +# ECO Scheduler Notes + +!!! Note + these will need to become the gleanerio scheduler documentation. + for now these are rough. Images and graphics need to be loaded + +```mermaid +flowchart TB +Postgres_Container-- defined by --> compose_project +Dagit_UI_Container-- defined by --> compose_project +Dagster_Container -- defined by --> compose_project +Headless_Container -- defined by --> compose_project +configs_volume_Container -- defined by --> compose_project +compose_project -- deployed to --> docker_portainer + +Gleaner_container -- image manual add --> docker_portainer +Nabu_container -- image manual add --> docker_portainer + +Gleaner_container -- deployed by --> Dagster_Container +Nabu_container -- deployed by --> Dagster_Container + +Gleaner_container-- deployed to --> docker_portainer +Nabu_container-- deployed to --> docker_portainer + +Dagit_UI_Container -- Created by --> Github_action +Dagster_Cotnainer -- Created by --> Github_action + +NabuConfig.tgz -- Archive to --> Nabu_container +GleanerConfig.tfz -- Archive to --> Gleaner_container + +NabuConfig.tgz -- Stored in s3 --> s3 +GleanerConfig.tfz -- Stored in s3 --> s3 + +configs_volume_Container -- populates volume --> dagster-project +dagster-project -- has --> gleanerConfig.yaml +dagster-project -- has --> nabuConfig.yaml +``` + +## Deploy + +### Deploy Dagster in Portainer +You will need to deploy dagster contiainers to portainer, for a docker swarm +0. get the portatinaer url, and auth token +0. SSH to the make hosting the docker. + +1. Pull scheduler repo +2. cd dagster/implnets/deployment +3. 
create a copy of envFile.env and **edit env variables** + 4. PROJECT=eco + 5. GLEANERIO_MINIO_ADDRESS ++ + 6. GLEANERIO_GRAPH_URL, GLEANERIO_GRAPH_NAMESPACE + 7. GLEANERIO_DOCKER_URL, GLEANERIO_PORTAINER_APIKEY + 8. SCHED_HOSTNAME defaults to sched +5. as noted as noted in (Compose, Environment and Docker API Assets), deploy the configuration to s3. +6. ~~create network and volumes needed `dagster_setup_docker.sh`~~ +7. manually add configs + 8. gleaner-{project} + 9. nabu-{project} + 10. workspace-{project} + 11. tenant-{project} + 11. dagster from:dagster/implnets/deployment/dagster.yaml +7. add configs to S3/Minio. + 8. scheduler/configs/gleanerconfig.yml + 9. scheduler/configs/tenant.yml +8. create a stack, + 9. gtibub repo: https://github.com/earthcube/scheduler.git + 10. branch: dev + 11. compose files: dagster/implnets/deployment/compose_project.yaml + 12. additional path: dagster/implnets/deployment/compose_project_eco_override.yaml + + + + + + + + diff --git a/docs/images/materialize.png b/docs/images/materialize.png new file mode 100644 index 00000000..a22a3f5f Binary files /dev/null and b/docs/images/materialize.png differ diff --git a/docs/images/overview_sensors_tab.png b/docs/images/overview_sensors_tab.png new file mode 100644 index 00000000..d6717a0c Binary files /dev/null and b/docs/images/overview_sensors_tab.png differ diff --git a/docs/images/pycharm_runconfig.png b/docs/images/pycharm_runconfig.png new file mode 100644 index 00000000..8a20630a Binary files /dev/null and b/docs/images/pycharm_runconfig.png differ diff --git a/docs/images/runs.png b/docs/images/runs.png new file mode 100644 index 00000000..570d703b Binary files /dev/null and b/docs/images/runs.png differ diff --git a/docs/images/schedules.png b/docs/images/schedules.png new file mode 100644 index 00000000..0fd65ff0 Binary files /dev/null and b/docs/images/schedules.png differ diff --git a/docs/images/schedules_example.png b/docs/images/schedules_example.png new file mode 100644 
index 00000000..d51638ac Binary files /dev/null and b/docs/images/schedules_example.png differ diff --git a/docs/images/schedules_select.png b/docs/images/schedules_select.png new file mode 100644 index 00000000..62b4d382 Binary files /dev/null and b/docs/images/schedules_select.png differ diff --git a/docs/images/schedules_tab.png b/docs/images/schedules_tab.png new file mode 100644 index 00000000..2366a838 Binary files /dev/null and b/docs/images/schedules_tab.png differ diff --git a/docs/images/schedules_test.png b/docs/images/schedules_test.png new file mode 100644 index 00000000..42af1f26 Binary files /dev/null and b/docs/images/schedules_test.png differ diff --git a/docs/images/sources_sensor.png b/docs/images/sources_sensor.png new file mode 100644 index 00000000..43361cf4 Binary files /dev/null and b/docs/images/sources_sensor.png differ diff --git a/docs/ingest_workflow.md b/docs/ingest_workflow.md new file mode 100644 index 00000000..6cb5438c --- /dev/null +++ b/docs/ingest_workflow.md @@ -0,0 +1,78 @@ +# ingest workflow + +This is found in implnets/workflows/ingest + + +```mermaid +flowchart LR + subgraph dagster + subgraph sensors + s3_config_sources_sensor['sources_all_active'] + s3_config_tenant_sensor['tenant with sources'] + sources_sensor + release_file_sensor + tenant_names_sensor + tenant_namespaces_job + end + subgraph jobs + summon_and_release + sources_config_updated + tenant_release + tenant_config_updated + end + subgraph assets + source_names_active + sources_all + tenant_names + tenant_all + end + + + end + s3_config_sources_sensor--monitors --> sources_config + s3_config_tenant_sensor--monitors -->tenant_config + s3_config_sources_sensor--starts-->sources_config_updated + sources_config_updated--materializes-->source_names_active + sources_config_updated--materializes-->sources_all + s3_config_tenant_sensor--starts-->tenant_config_updated + tenant_config_updated--creates-->tenant_names + tenant_config_updated--creates-->tenant_all + 
sources_sensor--monitors-->sources_all + sources_sensor--starts-->summon_and_release + summon_and_release--starts--> gleanerio + gleanerio-->summon + gleanerio-->graph_path + tenant_names-->tenant_names_sensor + tenant_names_sensor--starts-->tenant_namespaces_job + tenant_namespaces_job--creates--> tenant_namespace + tenant_namespaces_job--creates-->tenant_summary_namespace + release_file_sensor--monitors-->graph_path + release_file_sensor--loads-->tenant_namespace + release_file_sensor--loads-->tenant_summary_namespace + + subgraph portainer + gleanerio + tenant_ui + subgraph services + subgraph triplestore + tenant_namespace + tenant_summary_namespace + end + + subgraph minio_s3 + subgraph bucket_paths + subgraph scheduler + sources_config["`scheduler/configs/gleanerconfig.yaml`"] + tenant_config["`scheduler/configs/tenant.yaml`"] + logs + end + summon + graph_path['graph'] + end + end + + end + end + + +``` diff --git a/docs/quick.md b/docs/quick.md index 5091a718..7da525de 100644 --- a/docs/quick.md +++ b/docs/quick.md @@ -1,56 +1,37 @@ # Notes -## Implementation network builder +### Run Deploy Dagster locally (ROUGH) +Dagster needs a docker instance to run Gleanerio. We usually do this in a remote container. +Basically, you can run a single workflow with the UI from that workflows directory with a `dagster run` + +You will need to deploy dagster contiainers to portainer, for a docker swarm +0. get the portatinaer url, and auth token +0. SSH to the make hosting the docker. + +1. Pull scheduler repo +2. cd dagster/implnets/deployment +3. create a copy of envFile.env and **edit env variables** + 4. PROJECT=eco + 5. GLEANERIO_MINIO_ADDRESS ++ + 6. GLEANERIO_GRAPH_URL, GLEANERIO_GRAPH_NAMESPACE + 7. GLEANERIO_DOCKER_URL, GLEANERIO_PORTAINER_APIKEY + 8. SCHED_HOSTNAME defaults to sched +5. as noted as noted in (Compose, Environment and Docker API Assets), deploy the configuration to s3. +6. ~~create network and volumes needed `dagster_setup_docker.sh`~~ +7. 
manually add configs + 8. gleaner-{project} + 9. nabu-{project} + 10. workspace-{project} + 11. tenant-{project} + 11. dagster from:dagster/implnets/deployment/dagster.yaml +7. add configs to S3/Minio. + 8. scheduler/configs/gleanerconfig.yml + 9. scheduler/configs/tenant.yml +8. then you can run a command. in runCOnfigs there are PyCharm run files + 9. set ENV + 10. GLEANERIO_GLEANER_CONFIG_PATH=/Users/valentin/development/dev_earthcube/scheduler/dagster/implnets/configs/eco/gleanerconfig.yaml + 11. `cd dagster/implnets/workflows/ingest` + 12. `dagster run` + +**NEED MORE EXAMPLES** -The work for building the dagster containers for a given implementation network starts in -the directory ```scheduler/dagster/implnets```. At this time most of this can be driven by -the Makefile. - -1) Make sure your gleanerconfig.yaml file is in the configs/NETWORK directory where - NETWORK is your implmentation network like eco, iow, etc. -2) Check the VERSION file and make sure it has a value you want in it to be tagged to the containers. -3) ```make eco-clean``` will remove any existing generated code from the ./generatedCode/implnet-NETWORK directory -4) ```make eco-generate``` will build the code new. Set the -d N in the makefile to a value N that is the number - of days you want the runs to cycle over. So 30 would mean they run once every 30 days. If you want some providers - to index at different rates you currently need to go in and edit the associated provider _schedules_ file editing the - line ```@schedule(cron_schedule="0 12 * * 6", job=implnet_job_amgeo, execution_timezone="US/Central")``` with a - cron value you want. -5) ```make eco-build``` builds the Docker images following the build file ./build/Docker file. 
Note this uses the - command line argument ```--build-arg implnet=eco``` to set the implementation NETWORK so that the correct build code - from _generatedCode/NETWORK_ is copied over -6) ```make eco-push``` push to your container registry of choice, here docker.io - - - -## Compose, Environment and Docker API Assets - -1) You will need the (or need to make) the portainer access token - from your https://portainer.geocodes-aws-dev.earthcube.org/#!/account -2) You will need a valid Gleaner configuration file named gleanerconfig.yaml and a nabu config named nabuconfig.yaml -3) You will need the schema.org context files places in a directory _assets_ get each of the http and https versions - 1) ```wget https://schema.org/version/latest/schemaorg-current-https.jsonld``` - 2) ```wget https://schema.org/version/latest/schemaorg-current-http.jsonld``` -4) Generate the archive files for Gleaner and Nabu. Note the path to the context - files should map with what is in the configuration files - 1) ```tar -zcf ./archives/NabuCfg.tgz ./nabuconfig.yaml ./assets``` - 2) ```tar -zcf ./archives/GleanerCfg.tgz ./gleanerconfig.yaml ./assets``` -5) The archives .tgz files named _NabuCfg.tgz_ and _GleanerCfg.tgz_ need to be copied to the schedule prefix - in your bucket used for Gleaner - 1) ```mc cp GleanerCfg.tgz NabuCfg.tgz gleaner/scheduler/configs``` - 2) Make sure GLEANERIO_NABU_ARCHIVE_OBJECT and GLEANERIO_GLEANER_ARCHIVE_OBJECT reflect this location in the .env file -6) Next you will need to build the scheduler containers for your implementation network. Push these containers - to your container registry of choice and make sure the values are set in the .env file and that - the containers are available to Portainer or will get pulled on use. These are the image files in the - compose file and also the images notes in the environment variables GLEANERIO_GLEANER_IMAGE and GLEANERIO_NABU_IMAGE - in the .env file. 
- - -At this point you are ready to move to your Docker or Portainer environment and deploy the -compose and environment files. - -## Notes - -1) I do not have the API call to ensure/check/pull and image used by the API, so these images need to be - pulled down manually at this time. These are the images noted by the .env files at - ```GLEANERIO_GLEANER_IMAGE=fils/gleaner:v3.0.11-development-df``` and - ```GLEANERIO_NABU_IMAGE=fils/nabu:2.0.8-development``` diff --git a/docs/refactor.md b/docs/refactor.md new file mode 100644 index 00000000..dc9dd359 --- /dev/null +++ b/docs/refactor.md @@ -0,0 +1,33 @@ +# refactoring + +## rework code to generate less. +* ~~just generate the graph, and some configuration loading~~ done +* ~~pass the ops the configuration~~ + +## can we use s3 manager to store some assets? +* ~~reports seems like the ideal use case for these.~~ works + +## handle multiple workflows +* ~~need to add ability to deploy some other workflows~~ works + + +### Handle Multiple Organizations +* Each organization can be in a container with its own code workflow. +* If we can standardize the loading and transforming workflows as much as possible, then the graph loading workflows + should be more customizable +* to add a container, you need to edit the workflows.yaml in an organizations configuration + +### possible workflows +* ~~timeseries after final graph~~ done + * ~~generate a csv of the load reports size of (sitemap, summoned, summon failure, milled, loaded to graph, datasets)~~ + +* ~~weekly summary~~ done + * ~~what is the size of the graph this week.~~ +* post s3 check, as an approval check. + * do these not contain JSONLD + * store as asset, or maybe have file we publish as 'approved/expected non-summoned +* sitemap check + * just run a sitemap head to see that url work, and exist, weekly. + * publish as paritioned data in s3 ;) +* shacl... should we shacl releases. + * if so, then maybe teach dagster to watch the graph/latest for changes. 
diff --git a/mkdocs.yml b/mkdocs.yml index b41edfc5..a5b5a815 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -9,7 +9,7 @@ theme: - navigation.sections markdown_extensions: - toc: - permalink:  + permalink: - attr_list - def_list - tables @@ -26,9 +26,9 @@ plugins: - multirepo: # (optional) tells multirepo to cleanup the temporary directory after site is built. cleanup: true - - mermaid2 + - mermaid2 - literate-nav - - mkdocs-jupyter + - mkdocs-jupyter - callouts # get a NoneType error, even when trying to generate in Geocodes-Metadata # - schema_reader: @@ -36,7 +36,15 @@ plugins: # - "./docs/GeoCODES-Metadata/schemas/" nav: - Gleaner IO Scheduler: - - Dagster: README.md + - Scheduler: README.md + - Ingest Workflow: ingest_workflow.md - Quick: quick.md - Add Containers: add_containers.md + - Scheduler Deployment: eco_deploy.md + + - Develeopment: + - Local Developement: README_LOCAL_DEVELOPMENT.md + - Developing Schedules: developement.md + - Troubleshooting Workflows: monitoring_workflows.md + diff --git a/runConfigurations/compose_local.yaml_ Compose Deployment.run.xml b/runConfigurations/compose_local.yaml_ Compose Deployment.run.xml new file mode 100644 index 00000000..e5764469 --- /dev/null +++ b/runConfigurations/compose_local.yaml_ Compose Deployment.run.xml @@ -0,0 +1,11 @@ + + + + + + + + + \ No newline at end of file diff --git a/runConfigurations/dagster_eco_debug .run.xml b/runConfigurations/dagster_eco_debug .run.xml index 324befdc..13969f7a 100644 --- a/runConfigurations/dagster_eco_debug .run.xml +++ b/runConfigurations/dagster_eco_debug .run.xml @@ -1,13 +1,15 @@ + - \ No newline at end of file + diff --git a/runConfigurations/dagster_eco_implnet_job_geocodes_demo_datasets.run.xml b/runConfigurations/dagster_eco_implnet_job_geocodes_demo_datasets.run.xml new file mode 100644 index 00000000..9383d2e6 --- /dev/null +++ b/runConfigurations/dagster_eco_implnet_job_geocodes_demo_datasets.run.xml @@ -0,0 +1,37 @@ + + + + + \ No newline at end of file diff 
--git a/runConfigurations/dagster_ecrr__job examples.run.xml b/runConfigurations/dagster_ecrr__job examples.run.xml new file mode 100644 index 00000000..d5b79bf8 --- /dev/null +++ b/runConfigurations/dagster_ecrr__job examples.run.xml @@ -0,0 +1,40 @@ + + + + + \ No newline at end of file diff --git a/runConfigurations/dagster_ecrr_debug .run.xml b/runConfigurations/dagster_ecrr_debug .run.xml new file mode 100644 index 00000000..00996fe8 --- /dev/null +++ b/runConfigurations/dagster_ecrr_debug .run.xml @@ -0,0 +1,40 @@ + + + + + \ No newline at end of file diff --git a/runConfigurations/dagster_ecrr_run list.run.xml b/runConfigurations/dagster_ecrr_run list.run.xml new file mode 100644 index 00000000..335ca80c --- /dev/null +++ b/runConfigurations/dagster_ecrr_run list.run.xml @@ -0,0 +1,40 @@ + + + + + \ No newline at end of file diff --git a/runConfigurations/dagster_ingest_debug (1).run.xml b/runConfigurations/dagster_ingest_debug (1).run.xml new file mode 100644 index 00000000..f8db179a --- /dev/null +++ b/runConfigurations/dagster_ingest_debug (1).run.xml @@ -0,0 +1,38 @@ + + + + + \ No newline at end of file diff --git a/runConfigurations/dagster_ingest_materialize.run.xml b/runConfigurations/dagster_ingest_materialize.run.xml new file mode 100644 index 00000000..82f90cdf --- /dev/null +++ b/runConfigurations/dagster_ingest_materialize.run.xml @@ -0,0 +1,38 @@ + + + + + \ No newline at end of file diff --git a/runConfigurations/dagster_tasks_debug .run.xml b/runConfigurations/dagster_tasks_debug .run.xml new file mode 100644 index 00000000..627f0293 --- /dev/null +++ b/runConfigurations/dagster_tasks_debug .run.xml @@ -0,0 +1,37 @@ + + + + + diff --git a/runConfigurations/dagster_tasks_materialize loadstatsCommunity (1).run.xml b/runConfigurations/dagster_tasks_materialize loadstatsCommunity (1).run.xml new file mode 100644 index 00000000..3edc9ff7 --- /dev/null +++ b/runConfigurations/dagster_tasks_materialize loadstatsCommunity (1).run.xml @@ -0,0 +1,37 @@ 
+ + + + + \ No newline at end of file diff --git a/runConfigurations/dagster_tasks_materialize loadstatsCommunity.run.xml b/runConfigurations/dagster_tasks_materialize loadstatsCommunity.run.xml new file mode 100644 index 00000000..ccac142f --- /dev/null +++ b/runConfigurations/dagster_tasks_materialize loadstatsCommunity.run.xml @@ -0,0 +1,37 @@ + + + + + \ No newline at end of file diff --git a/runConfigurations/dagster_tasks_materialize loadstatsHistory.run.xml b/runConfigurations/dagster_tasks_materialize loadstatsHistory.run.xml new file mode 100644 index 00000000..6f8cf336 --- /dev/null +++ b/runConfigurations/dagster_tasks_materialize loadstatsHistory.run.xml @@ -0,0 +1,37 @@ + + + + + \ No newline at end of file diff --git a/runConfigurations/dagster_tasks_materialize sos_types.run.xml b/runConfigurations/dagster_tasks_materialize sos_types.run.xml new file mode 100644 index 00000000..984baf8e --- /dev/null +++ b/runConfigurations/dagster_tasks_materialize sos_types.run.xml @@ -0,0 +1,37 @@ + + + + + \ No newline at end of file diff --git a/runConfigurations/dagster_tasks_materialize tennants.run.xml b/runConfigurations/dagster_tasks_materialize tennants.run.xml new file mode 100644 index 00000000..2c3cf813 --- /dev/null +++ b/runConfigurations/dagster_tasks_materialize tennants.run.xml @@ -0,0 +1,37 @@ + + + + + \ No newline at end of file diff --git a/runConfigurations/pygen eco v1.run.xml b/runConfigurations/pygen eco v1.run.xml index 2b180505..528a1abd 100644 --- a/runConfigurations/pygen eco v1.run.xml +++ b/runConfigurations/pygen eco v1.run.xml @@ -7,8 +7,9 @@